From 0c5a3a6fdb945686fedb1d1881acf03f413e895f Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 30 Jul 2024 19:57:15 +0200 Subject: [PATCH 01/50] Mark pg_prewarm as a trusted extension It doesn't have any inherently unsafe functions that allow users to do things they otherwise wouldn't be able to do so; except more efficiently. --- contrib/pg_prewarm/pg_prewarm.control | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/pg_prewarm/pg_prewarm.control b/contrib/pg_prewarm/pg_prewarm.control index 40e3add4810..d40d1a000b7 100644 --- a/contrib/pg_prewarm/pg_prewarm.control +++ b/contrib/pg_prewarm/pg_prewarm.control @@ -3,3 +3,4 @@ comment = 'prewarm relation data' default_version = '1.2' module_pathname = '$libdir/pg_prewarm' relocatable = true +trusted = true From 094c96afa9b5096d5f4f893c51bedf4b4f506e8d Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 30 Jul 2024 19:59:58 +0200 Subject: [PATCH 02/50] NEON: Fix BRIN redo in PS' WAL-redo process By using the WAL record as source of the block number, we don't have to rely on buffer pinning, which doesn't work in the WAL-redo process. --- src/backend/access/brin/brin_xlog.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index 85db2f0fd5a..280fae5aaf0 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -69,7 +69,8 @@ brin_xlog_insert_update(XLogReaderState *record, } /* need this page's blkno to store in revmap */ - regpgno = BufferGetBlockNumber(buffer); + /* NEON XXX Don't use BufferGetBlockNumber because wal-redo doesn't pin buffer. 
*/ + XLogRecGetBlockTag(record, 0, NULL, NULL, ®pgno); /* insert the index item into the page */ if (action == BLK_NEEDS_REDO) From 524a0bc1d22964abf5bf475f1282ae0525318c06 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 30 Jul 2024 21:04:19 +0200 Subject: [PATCH 03/50] NEON: Replace tuple INSERT/UPDATE/DELETE records with custom ones The vanilla records don't have cid, which is required for persistence during PS WAL redo. --- src/backend/access/heap/heapam.c | 125 +++++++++++++++-------- src/backend/access/heap/heapam_xlog.c | 15 ++- src/include/access/heapam_xlog.h | 1 + src/include/access/neon_xlog.h | 136 ++++++++++++++++++++++++++ 4 files changed, 233 insertions(+), 44 deletions(-) create mode 100644 src/include/access/neon_xlog.h diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 0dcd6ee817e..abe7957d191 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -35,6 +35,7 @@ #include "access/heaptoast.h" #include "access/hio.h" #include "access/multixact.h" +#include "access/neon_xlog.h" #include "access/subtrans.h" #include "access/syncscan.h" #include "access/valid.h" @@ -2154,11 +2155,11 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, /* XLOG stuff */ if (RelationNeedsWAL(relation)) { - xl_heap_insert xlrec; - xl_heap_header xlhdr; + xl_neon_heap_insert xlrec; + xl_neon_heap_header xlhdr; XLogRecPtr recptr; Page page = BufferGetPage(buffer); - uint8 info = XLOG_HEAP_INSERT; + uint8 info = XLOG_NEON_HEAP_INSERT; int bufflags = 0; /* @@ -2176,7 +2177,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber && PageGetMaxOffsetNumber(page) == FirstOffsetNumber) { - info |= XLOG_HEAP_INIT_PAGE; + info |= XLOG_NEON_INIT_PAGE; bufflags |= REGBUF_WILL_INIT; } @@ -2204,11 +2205,12 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, } XLogBeginInsert(); - 
XLogRegisterData(&xlrec, SizeOfHeapInsert); + XLogRegisterData(&xlrec, SizeOfNeonHeapInsert); xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; xlhdr.t_infomask = heaptup->t_data->t_infomask; xlhdr.t_hoff = heaptup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); /* * note we mark xlhdr as belonging to buffer; if XLogInsert decides to @@ -2216,7 +2218,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, * xl_heap_header in the xlog. */ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); - XLogRegisterBufData(0, &xlhdr, SizeOfHeapHeader); + XLogRegisterBufData(0, &xlhdr, SizeOfNeonHeapHeader); /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ XLogRegisterBufData(0, (char *) heaptup->t_data + SizeofHeapTupleHeader, @@ -2225,14 +2227,25 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP_ID, info); + recptr = XLogInsert(RM_NEON_ID, info); PageSetLSN(page, recptr); } END_CRIT_SECTION(); - UnlockReleaseBuffer(buffer); + if (options & HEAP_INSERT_SPECULATIVE) + { + /* + * NEON: speculative token is not stored in WAL, so if the page is evicted + * from the buffer cache, the token will be lost. To prevent that, we keep the + * buffer pinned. It will be unpinned in heapam_tuple_finish/abort_speculative. 
+ */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + else + UnlockReleaseBuffer(buffer); + if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); @@ -2518,8 +2531,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (needwal) { XLogRecPtr recptr; - xl_heap_multi_insert *xlrec; - uint8 info = XLOG_HEAP2_MULTI_INSERT; + xl_neon_heap_multi_insert *xlrec; + uint8 info = XLOG_NEON_HEAP_MULTI_INSERT; char *tupledata; int totaldatalen; char *scratchptr = scratch.data; @@ -2532,9 +2545,9 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, */ init = starting_with_empty_page; - /* allocate xl_heap_multi_insert struct from the scratch area */ - xlrec = (xl_heap_multi_insert *) scratchptr; - scratchptr += SizeOfHeapMultiInsert; + /* allocate xl_neon_heap_multi_insert struct from the scratch area */ + xlrec = (xl_neon_heap_multi_insert *) scratchptr; + scratchptr += SizeOfNeonHeapMultiInsert; /* * Allocate offsets array. Unless we're reinitializing the page, @@ -2566,17 +2579,25 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, for (i = 0; i < nthispage; i++) { HeapTuple heaptup = heaptuples[ndone + i]; - xl_multi_insert_tuple *tuphdr; + xl_neon_multi_insert_tuple *tuphdr; int datalen; if (!init) xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); /* xl_multi_insert_tuple needs two-byte alignment. 
*/ - tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); - scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; + tuphdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(scratchptr); + scratchptr = ((char *) tuphdr) + SizeOfNeonMultiInsertTuple; tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; tuphdr->t_infomask = heaptup->t_data->t_infomask; + if (i == 0) + { + xlrec->t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); + } + else + { + Assert(xlrec->t_cid == HeapTupleHeaderGetRawCommandId(heaptup->t_data)); + } tuphdr->t_hoff = heaptup->t_data->t_hoff; /* write bitmap [+ padding] [+ oid] + data */ @@ -2603,7 +2624,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (init) { - info |= XLOG_HEAP_INIT_PAGE; + info |= XLOG_NEON_INIT_PAGE; bufflags |= REGBUF_WILL_INIT; } @@ -2623,7 +2644,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP2_ID, info); + recptr = XLogInsert(RM_NEON_ID, info); PageSetLSN(page, recptr); } @@ -2728,6 +2749,7 @@ compute_infobits(uint16 infomask, uint16 infomask2) ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) | ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) | ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) | + ((infomask & HEAP_COMBOCID) != 0 ? XLHL_COMBOCID : 0) | /* note we ignore HEAP_XMAX_SHR_LOCK here */ ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) | ((infomask2 & HEAP_KEYS_UPDATED) != 0 ? 
@@ -3077,8 +3099,8 @@ heap_delete(Relation relation, ItemPointer tid, */ if (RelationNeedsWAL(relation)) { - xl_heap_delete xlrec; - xl_heap_header xlhdr; + xl_neon_heap_delete xlrec; + xl_neon_heap_header xlhdr; XLogRecPtr recptr; /* @@ -3097,6 +3119,7 @@ heap_delete(Relation relation, ItemPointer tid, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = new_xmax; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); if (old_key_tuple != NULL) { @@ -3107,7 +3130,7 @@ heap_delete(Relation relation, ItemPointer tid, } XLogBeginInsert(); - XLogRegisterData(&xlrec, SizeOfHeapDelete); + XLogRegisterData(&xlrec, SizeOfNeonHeapDelete); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); @@ -3118,9 +3141,10 @@ heap_delete(Relation relation, ItemPointer tid, { xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; - XLogRegisterData(&xlhdr, SizeOfHeapHeader); + XLogRegisterData(&xlhdr, SizeOfNeonHeapHeader); XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader, old_key_tuple->t_len @@ -3130,7 +3154,7 @@ heap_delete(Relation relation, ItemPointer tid, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_DELETE); PageSetLSN(page, recptr); } @@ -3876,7 +3900,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, if (RelationNeedsWAL(relation)) { - xl_heap_lock xlrec; + xl_neon_heap_lock xlrec; XLogRecPtr recptr; XLogBeginInsert(); @@ -3888,8 +3912,9 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? 
XLH_LOCK_ALL_FROZEN_CLEARED : 0; - XLogRegisterData(&xlrec, SizeOfHeapLock); - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(oldtup.t_data); + XLogRegisterData(&xlrec, SizeOfNeonHeapLock); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_LOCK); PageSetLSN(page, recptr); } @@ -5208,7 +5233,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (RelationNeedsWAL(relation)) { - xl_heap_lock xlrec; + xl_neon_heap_lock xlrec; XLogRecPtr recptr; XLogBeginInsert(); @@ -5219,11 +5244,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, xlrec.infobits_set = compute_infobits(new_infomask, tuple->t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; - XLogRegisterData(&xlrec, SizeOfHeapLock); + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tuple->t_data); + XLogRegisterData(&xlrec, SizeOfNeonHeapLock); /* we don't decode row locks atm, so no need to log the origin */ - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_LOCK); PageSetLSN(page, recptr); } @@ -6135,6 +6161,7 @@ heap_finish_speculative(Relation relation, ItemPointer tid) END_CRIT_SECTION(); + ReleaseBuffer(buffer); /* NEON: release buffer pinned by heap_insert */ UnlockReleaseBuffer(buffer); } @@ -6206,6 +6233,16 @@ heap_abort_speculative(Relation relation, ItemPointer tid) elog(ERROR, "attempted to kill a non-speculative tuple"); Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); + /* + * NEON: release buffer pinned by heap_insert + * + * This function is also used on the toast tuples of an aborted speculative + * insertion. For those, there is no token on the tuple, and we didn' t keep + * the pin. + */ + if (HeapTupleHeaderIsSpeculative(tp.t_data)) + ReleaseBuffer(buffer); + /* * No need to check for serializable conflicts here. There is never a * need for a combo CID, either. 
No need to extract replica identity, or @@ -6260,7 +6297,7 @@ heap_abort_speculative(Relation relation, ItemPointer tid) */ if (RelationNeedsWAL(relation)) { - xl_heap_delete xlrec; + xl_neon_heap_delete xlrec; XLogRecPtr recptr; xlrec.flags = XLH_DELETE_IS_SUPER; @@ -6268,14 +6305,15 @@ heap_abort_speculative(Relation relation, ItemPointer tid) tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = xid; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); XLogBeginInsert(); - XLogRegisterData(&xlrec, SizeOfHeapDelete); + XLogRegisterData(&xlrec, SizeOfNeonHeapDelete); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); /* No replica identity & replication origin logged */ - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_DELETE); PageSetLSN(page, recptr); } @@ -8833,9 +8871,9 @@ log_heap_update(Relation reln, Buffer oldbuf, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared) { - xl_heap_update xlrec; - xl_heap_header xlhdr; - xl_heap_header xlhdr_idx; + xl_neon_heap_update xlrec; + xl_neon_heap_header xlhdr; + xl_neon_heap_header xlhdr_idx; uint8 info; uint16 prefix_suffix[2]; uint16 prefixlen = 0, @@ -8852,9 +8890,9 @@ log_heap_update(Relation reln, Buffer oldbuf, XLogBeginInsert(); if (HeapTupleIsHeapOnly(newtup)) - info = XLOG_HEAP_HOT_UPDATE; + info = XLOG_NEON_HEAP_HOT_UPDATE; else - info = XLOG_HEAP_UPDATE; + info = XLOG_NEON_HEAP_UPDATE; /* * If the old and new tuple are on the same page, we only need to log the @@ -8934,7 +8972,7 @@ log_heap_update(Relation reln, Buffer oldbuf, if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && PageGetMaxOffsetNumber(page) == FirstOffsetNumber) { - info |= XLOG_HEAP_INIT_PAGE; + info |= XLOG_NEON_INIT_PAGE; init = true; } else @@ -8945,6 +8983,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); xlrec.old_infobits_set = 
compute_infobits(oldtup->t_data->t_infomask, oldtup->t_data->t_infomask2); + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(oldtup->t_data); /* Prepare WAL data for the new page */ xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); @@ -8960,7 +8999,7 @@ log_heap_update(Relation reln, Buffer oldbuf, if (oldbuf != newbuf) XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD); - XLogRegisterData(&xlrec, SizeOfHeapUpdate); + XLogRegisterData(&xlrec, SizeOfNeonHeapUpdate); /* * Prepare WAL data for the new tuple. @@ -8986,6 +9025,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr.t_infomask2 = newtup->t_data->t_infomask2; xlhdr.t_infomask = newtup->t_data->t_infomask; xlhdr.t_hoff = newtup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); /* @@ -8993,7 +9033,7 @@ log_heap_update(Relation reln, Buffer oldbuf, * * The 'data' doesn't include the common prefix or suffix. */ - XLogRegisterBufData(0, &xlhdr, SizeOfHeapHeader); + XLogRegisterBufData(0, &xlhdr, SizeOfNeonHeapHeader); if (prefixlen == 0) { XLogRegisterBufData(0, @@ -9027,8 +9067,9 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; + xlhdr_idx.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); - XLogRegisterData(&xlhdr_idx, SizeOfHeapHeader); + XLogRegisterData(&xlhdr_idx, SizeOfNeonHeapHeader); /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader, @@ -9038,7 +9079,7 @@ log_heap_update(Relation reln, Buffer oldbuf, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP_ID, info); + recptr = XLogInsert(RM_NEON_ID, info); return recptr; } diff --git 
a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c index eb4bd3d6ae3..a201efceb5e 100644 --- a/src/backend/access/heap/heapam_xlog.c +++ b/src/backend/access/heap/heapam_xlog.c @@ -227,7 +227,16 @@ heap_xlog_visible(XLogReaderState *record) PageSetAllVisible(page); - if (XLogHintBitIsNeeded()) + /* + * NEON: despite to the comment above we need to update page LSN here. + * See discussion at hackers: https://www.postgresql.org/message-id/flat/039076d4f6cdd871691686361f83cb8a6913a86a.camel%40j-davis.com#101ba42b004f9988e3d54fce26fb3462 + * For Neon this assignment is critical because otherwise last written LSN tracked at compute doesn't + * match with page LSN assignee by WAL-redo and as a result, prefetched page is rejected. + * + * It is fixed in upstream in https://github.com/neondatabase/postgres/commit/7bf713dd2d0739fbcd4103971ed69c17ebe677ea + * but until it is merged we still need to carry a patch here. + */ + if (true || XLogHintBitIsNeeded()) PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -317,7 +326,7 @@ static void fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) { *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | - HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK | HEAP_COMBOCID); *infomask2 &= ~HEAP_KEYS_UPDATED; if (infobits & XLHL_XMAX_IS_MULTI) @@ -326,6 +335,8 @@ fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) *infomask |= HEAP_XMAX_LOCK_ONLY; if (infobits & XLHL_XMAX_EXCL_LOCK) *infomask |= HEAP_XMAX_EXCL_LOCK; + if (infobits & XLHL_COMBOCID) + *infomask |= HEAP_COMBOCID; /* note HEAP_XMAX_SHR_LOCK isn't considered here */ if (infobits & XLHL_XMAX_KEYSHR_LOCK) *infomask |= HEAP_XMAX_KEYSHR_LOCK; diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 277df6b3cf0..05a9f15f461 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -388,6 +388,7 
@@ typedef struct xlhp_prune_items #define XLHL_XMAX_EXCL_LOCK 0x04 #define XLHL_XMAX_KEYSHR_LOCK 0x08 #define XLHL_KEYS_UPDATED 0x10 +#define XLHL_COMBOCID 0x20 /* flag bits for xl_heap_lock / xl_heap_lock_updated's flag field */ #define XLH_LOCK_ALL_FROZEN_CLEARED 0x01 diff --git a/src/include/access/neon_xlog.h b/src/include/access/neon_xlog.h new file mode 100644 index 00000000000..1f29d526d70 --- /dev/null +++ b/src/include/access/neon_xlog.h @@ -0,0 +1,136 @@ +#ifndef NEON_XLOG_H +#define NEON_XLOG_H + +#include "storage/off.h" + +/* + * The RMGR id of the Neon RMGR + * + * Reserved at https://wiki.postgresql.org/wiki/CustomWALResourceManagers + */ +#define RM_NEON_ID 134 + +#define XLOG_NEON_INIT_PAGE 0x80 +#define XLOG_NEON_OPMASK ((~XLOG_NEON_INIT_PAGE) & XLR_RMGR_INFO_MASK) + +/* from XLOG_HEAP_* */ +#define XLOG_NEON_HEAP_INSERT 0x00 +#define XLOG_NEON_HEAP_DELETE 0x10 +#define XLOG_NEON_HEAP_UPDATE 0x20 +#define XLOG_NEON_HEAP_HOT_UPDATE 0x30 +#define XLOG_NEON_HEAP_LOCK 0x40 +/* from XLOG_HEAP2_* */ +#define XLOG_NEON_HEAP_MULTI_INSERT 0x50 +/* 2 variants available */ + +/* */ +typedef struct xl_neon_heap_header { + uint16 t_infomask2; + uint16 t_infomask; + uint32 t_cid; + uint8 t_hoff; +} xl_neon_heap_header; + +#define SizeOfNeonHeapHeader (offsetof(xl_neon_heap_header, t_hoff) + sizeof(uint8)) + +/* This is what we need to know about insert */ +typedef struct xl_neon_heap_insert +{ + OffsetNumber offnum; /* inserted tuple's offset */ + uint8 flags; + + /* xl_neon_heap_header & TUPLE DATA in backup block 0 */ +} xl_neon_heap_insert; + +#define SizeOfNeonHeapInsert (offsetof(xl_neon_heap_insert, flags) + sizeof(uint8)) + +/* This is what we need to know about delete */ +typedef struct xl_neon_heap_delete +{ + TransactionId xmax; /* xmax of the deleted tuple */ + OffsetNumber offnum; /* deleted tuple's offset */ + uint8 infobits_set; /* infomask bits */ + uint8 flags; + uint32 t_cid; +} xl_neon_heap_delete; + +#define SizeOfNeonHeapDelete 
(offsetof(xl_neon_heap_delete, t_cid) + sizeof(uint32)) + +/* + * This is what we need to know about update|hot_update + * + * Backup blk 0: new page + * + * If XLH_UPDATE_PREFIX_FROM_OLD or XLH_UPDATE_SUFFIX_FROM_OLD flags are set, + * the prefix and/or suffix come first, as one or two uint16s. + * + * After that, xl_neon_heap_header and new tuple data follow. The new tuple + * data doesn't include the prefix and suffix, which are copied from the + * old tuple on replay. + * + * If XLH_UPDATE_CONTAINS_NEW_TUPLE flag is given, the tuple data is + * included even if a full-page image was taken. + * + * Backup blk 1: old page, if different. (no data, just a reference to the blk) + */ +typedef struct xl_neon_heap_update +{ + TransactionId old_xmax; /* xmax of the old tuple */ + OffsetNumber old_offnum; /* old tuple's offset */ + uint8 old_infobits_set; /* infomask bits to set on old tuple */ + uint8 flags; + uint32 t_cid; + TransactionId new_xmax; /* xmax of the new tuple */ + OffsetNumber new_offnum; /* new tuple's offset */ + + /* + * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags + * are set, xl_neon_heap_header and tuple data for the old tuple follow. + */ +} xl_neon_heap_update; +#define SizeOfNeonHeapUpdate (offsetof(xl_neon_heap_update, new_offnum) + sizeof(OffsetNumber)) + +typedef struct xl_neon_heap_lock +{ + TransactionId xmax; /* might be a MultiXactId */ + uint32 t_cid; + OffsetNumber offnum; /* locked tuple's offset on page */ + uint8 infobits_set; /* infomask and infomask2 bits to set */ + uint8 flags; /* XLH_LOCK_* flag bits */ +} xl_neon_heap_lock; + +#define SizeOfNeonHeapLock (offsetof(xl_neon_heap_lock, flags) + sizeof(uint8)) + +/* + * This is what we need to know about a multi-insert. + * + * The main data of the record consists of this xl_neon_heap_multi_insert header. + * 'offsets' array is omitted if the whole page is reinitialized + * (XLOG_HEAP_INIT_PAGE). 
+ * + * In block 0's data portion, there is an xl_neon_multi_insert_tuple struct, + * followed by the tuple data for each tuple. There is padding to align + * each xl_neon_multi_insert_tuple struct. + */ +typedef struct xl_neon_heap_multi_insert +{ + uint8 flags; + uint16 ntuples; + uint32 t_cid; + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} xl_neon_heap_multi_insert; + +#define SizeOfNeonHeapMultiInsert offsetof(xl_neon_heap_multi_insert, offsets) + +typedef struct xl_neon_multi_insert_tuple +{ + uint16 datalen; /* size of tuple data that follows */ + uint16 t_infomask2; + uint16 t_infomask; + uint8 t_hoff; + /* TUPLE DATA FOLLOWS AT END OF STRUCT */ +} xl_neon_multi_insert_tuple; + +#define SizeOfNeonMultiInsertTuple (offsetof(xl_neon_multi_insert_tuple, t_hoff) + sizeof(uint8)) + +#endif //NEON_XLOG_H From 9e737ae90e7a3aa38bcceb800325e250506037ff Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 30 Jul 2024 21:11:00 +0200 Subject: [PATCH 04/50] NEON: Update Sequence logging to 0 (from 32) A sequence's page can be evicted quite quickly with low shared_buffers configurations, which then causes values to be skipped. This prevents that, at the cost of higher WAL volume when sequences are involved. --- src/backend/commands/sequence.c | 8 +++++++- src/test/regress/expected/sequence.out | 2 +- src/test/regress/sql/sequence.sql | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 451ae6f7f69..30faf79898a 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -55,7 +55,13 @@ * so we pre-log a few fetches in advance. In the event of * crash we can lose (skip over) as many values as we pre-logged. */ -#define SEQ_LOG_VALS 32 +/* + * NEON XXX: to ensure sequential order of sequence in NEON without + * too large gaps even without restarts, we need to WAL log each + * sequence update. 
+ */ +/* #define SEQ_LOG_VALS 32 */ +#define SEQ_LOG_VALS 0 /* * The "special area" of a sequence's buffer page looks like this. diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index 15925d99c8a..b8bd7466f1d 100644 --- a/src/test/regress/expected/sequence.out +++ b/src/test/regress/expected/sequence.out @@ -260,7 +260,7 @@ SELECT nextval('foo_seq_new'); -- log_cnt can be higher if there is a checkpoint just at the right -- time, so just test for the expected range -SELECT last_value, log_cnt IN (31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; +SELECT last_value, log_cnt IN (0, 31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; last_value | log_cnt_ok | is_called ------------+------------+----------- 2 | t | t diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index 2c220b60749..5872c355d5c 100644 --- a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -137,7 +137,7 @@ SELECT nextval('foo_seq_new'); SELECT nextval('foo_seq_new'); -- log_cnt can be higher if there is a checkpoint just at the right -- time, so just test for the expected range -SELECT last_value, log_cnt IN (31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; +SELECT last_value, log_cnt IN (0, 31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; DROP SEQUENCE foo_seq_new; -- renaming serial sequences From d18681049f15d20f328eeff7fe1531e9c6d37a2f Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 31 Jul 2024 12:54:17 +0200 Subject: [PATCH 05/50] NEON: Allow the user to specify a cluster ID in initdb --- src/backend/access/transam/xlog.c | 2 ++ src/backend/bootstrap/bootstrap.c | 12 +++++++++++- src/bin/initdb/initdb.c | 11 ++++++++++- src/include/bootstrap/bootstrap.h | 2 +- 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 184de54f3a1..3a9494a552f 100644 --- a/src/backend/access/transam/xlog.c +++ 
b/src/backend/access/transam/xlog.c @@ -63,6 +63,7 @@ #include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "backup/basebackup.h" +#include "bootstrap/bootstrap.h" #include "catalog/catversion.h" #include "catalog/pg_control.h" #include "catalog/pg_database.h" @@ -136,6 +137,7 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; +uint64 predefined_sysidentifier; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 6db864892d0..e2acd940cbb 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -218,13 +218,23 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) argv++; argc--; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:X:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:s:X:-:")) != -1) { switch (flag) { case 'B': SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; + case 's': + { + char* endptr; +#ifdef HAVE_STRTOULL + predefined_sysidentifier = strtoull(optarg, &endptr, 10); +#else + predefined_sysidentifier = strtoul(optarg, &endptr, 10); +#endif + break; + } case '-': /* diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 62bbd08d9f6..990b7fba222 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -227,6 +227,7 @@ static const char *const backend_options = "--single -F -O -j -c search_path=pg_ /* Additional switches to pass to backend (either boot or standalone) */ static char *extra_options = ""; +static char *extra_boot_options = ""; static const char *const subdirs[] = { "global", @@ -1609,7 +1610,9 @@ bootstrap_template1(void) initPQExpBuffer(&cmd); - printfPQExpBuffer(&cmd, "\"%s\" --boot %s %s", backend_exec, boot_options, extra_options); + printfPQExpBuffer(&cmd, "\"%s\" --boot %s %s %s", + backend_exec, boot_options, extra_boot_options, + 
extra_options); appendPQExpBuffer(&cmd, " -X %d", wal_segment_size_mb * (1024 * 1024)); if (data_checksums) appendPQExpBufferStr(&cmd, " -k"); @@ -3198,6 +3201,7 @@ main(int argc, char *argv[]) {"sync-method", required_argument, NULL, 19}, {"no-data-checksums", no_argument, NULL, 20}, {"no-sync-data-files", no_argument, NULL, 21}, + {"sysid", required_argument, NULL, 22}, {NULL, 0, NULL, 0} }; @@ -3395,6 +3399,11 @@ main(int argc, char *argv[]) case 21: sync_data_files = false; break; + case 22: + extra_boot_options = psprintf("%s -s %s", + extra_boot_options, + optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h index befc4fa1b3d..f83b6f04f8d 100644 --- a/src/include/bootstrap/bootstrap.h +++ b/src/include/bootstrap/bootstrap.h @@ -31,7 +31,7 @@ extern PGDLLIMPORT Relation boot_reldesc; extern PGDLLIMPORT Form_pg_attribute attrtypes[MAXATTR]; extern PGDLLIMPORT int numattr; - +extern PGDLLIMPORT uint64 predefined_sysidentifier; pg_noreturn extern void BootstrapModeMain(int argc, char *argv[], bool check_only); From 9ac839bb6da82cb974ba3287b0cad8fd94b2a0b8 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 31 Jul 2024 13:53:37 +0200 Subject: [PATCH 06/50] NEON: Add a LwLsnCache hooks to keep track of recent smgr-level changes This allows us to use lower (on average) LSNs in requests to PS, thus increasing the WAL lag tolerance of the pageserver. Note that callers to smgr_end_unlogged_build don't have to update LwLsnCache, as that call internally handles it. 
--- src/backend/access/gist/gistbuild.c | 13 +++++ src/backend/access/transam/xlog.c | 47 +++++++++++++++++-- src/backend/catalog/storage.c | 5 +- src/backend/commands/dbcommands.c | 30 +++++++++++- .../utils/activity/wait_event_names.txt | 1 + src/include/access/xlog.h | 29 ++++++++++++ src/include/storage/lwlocklist.h | 1 + 7 files changed, 119 insertions(+), 7 deletions(-) diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 9e707167d98..11ab63d6a8d 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -38,6 +38,8 @@ #include "access/gist_private.h" #include "access/tableam.h" #include "access/xloginsert.h" +#include "catalog/index.h" +#include "catalog/storage.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "optimizer/optimizer.h" @@ -448,6 +450,17 @@ gist_indexsortbuild(GISTBuildState *state) memcpy(rootbuf, levelstate->pages[0], BLCKSZ); smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true); + if (RelationNeedsWAL(state->indexrel)) + { + XLogRecPtr lsn = GetRedoRecPtr(); + + if (set_lwlsn_block_hook) + set_lwlsn_block_hook(lsn, state->indexrel->rd_smgr->smgr_rlocator.locator, + MAIN_FORKNUM, GIST_ROOT_BLKNO); + if (set_lwlsn_relation_hook) + set_lwlsn_relation_hook(lsn, state->indexrel->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM); + } + pfree(levelstate); smgr_bulk_finish(state->bulkstate); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 3a9494a552f..3fe774ed157 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -84,6 +84,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/buf_internals.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/large_object.h" @@ -145,6 +146,14 @@ bool XLOG_DEBUG = false; int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; +/* NEON: Hooks to facilitate the last 
written LSN cache */ +set_lwlsn_block_hook_type set_lwlsn_block_hook = NULL; +set_lwlsn_block_range_hook_type set_lwlsn_block_range_hook = NULL; +set_lwlsn_block_v_hook_type set_lwlsn_block_v_hook = NULL; +set_lwlsn_db_hook_type set_lwlsn_db_hook = NULL; +set_lwlsn_relation_hook_type set_lwlsn_relation_hook = NULL; +set_max_lwlsn_hook_type set_max_lwlsn_hook = NULL; + /* * Number of WAL insertion locks to use. A higher value allows more insertions * to happen concurrently, but adds some CPU overhead to flushing the WAL, @@ -204,6 +213,7 @@ const struct config_enum_entry archive_mode_options[] = { {NULL, 0, false} }; + /* * Statistics for current checkpoint are collected in this global struct. * Because only the checkpointer or a stand-alone backend can perform @@ -564,6 +574,9 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* neon: copy of startup's RedoStartLSN for walproposer's use */ + XLogRecPtr RedoStartLSN; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -5235,10 +5248,17 @@ BootStrapXLOG(uint32 data_checksum_version) * determine the initialization time of the installation, which could * perhaps be useful sometimes. 
*/ - gettimeofday(&tv, NULL); - sysidentifier = ((uint64) tv.tv_sec) << 32; - sysidentifier |= ((uint64) tv.tv_usec) << 12; - sysidentifier |= getpid() & 0xFFF; + if (predefined_sysidentifier != 0) + { + sysidentifier = predefined_sysidentifier; + } + else + { + gettimeofday(&tv, NULL); + sysidentifier = ((uint64) tv.tv_sec) << 32; + sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier |= getpid() & 0xFFF; + } /* page buffer must be aligned suitably for O_DIRECT */ buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ); @@ -5867,6 +5887,9 @@ StartupXLOG(void) RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; doPageWrites = lastFullPageWrites; + if (set_max_lwlsn_hook) + set_max_lwlsn_hook(RedoRecPtr); + /* REDO */ if (InRecovery) { @@ -6671,6 +6694,22 @@ GetInsertRecPtr(void) return recptr; } +void +SetRedoStartLsn(XLogRecPtr RedoStartLSN) +{ + XLogCtl->RedoStartLSN = RedoStartLSN; +} + +/* + * RedoStartLsn is set only once by startup process, locking is not required + * after its exit. + */ +XLogRecPtr +GetRedoStartLsn(void) +{ + return XLogCtl->RedoStartLSN; +} + /* * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL * position known to be fsync'd to disk. This should only be used on a diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 227df90f89c..5ba81adda4c 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -187,6 +187,7 @@ void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum) { xl_smgr_create xlrec; + XLogRecPtr lsn; /* * Make an XLOG entry reporting the file creation. 
@@ -196,7 +197,9 @@ log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum) XLogBeginInsert(); XLogRegisterData(&xlrec, sizeof(xlrec)); - XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + if (set_lwlsn_relation_hook) + set_lwlsn_relation_hook(lsn, *rlocator, forkNum); } /* diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index ef99375d72b..9bc14aa1b49 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1551,6 +1551,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) CreateDatabaseUsingFileCopy(src_dboid, dboid, src_deftablespace, dst_deftablespace); + /* + * Update global last written LSN after wal-logging create database command + */ + if (set_lwlsn_db_hook) + set_lwlsn_db_hook(XactLastRecEnd); + /* * Close pg_database, but keep lock till commit. */ @@ -2202,6 +2208,7 @@ movedb(const char *dbname, const char *tblspcname) */ { xl_dbase_create_file_copy_rec xlrec; + XLogRecPtr lsn; xlrec.db_id = db_id; xlrec.tablespace_id = dst_tblspcoid; @@ -2212,8 +2219,11 @@ movedb(const char *dbname, const char *tblspcname) XLogRegisterData(&xlrec, sizeof(xl_dbase_create_file_copy_rec)); - (void) XLogInsert(RM_DBASE_ID, - XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_DBASE_ID, + XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + // TODO: Do we really need to set the LSN here? + if (set_lwlsn_db_hook) + set_lwlsn_db_hook(lsn); } /* @@ -3382,6 +3392,14 @@ dbase_redo(XLogReaderState *record) */ copydir(src_path, dst_path, false); + /* + * Make sure any future requests to the page server see the new + * database. + */ + if (set_lwlsn_db_hook) + set_lwlsn_db_hook(record->EndRecPtr); + + pfree(src_path); pfree(dst_path); } @@ -3402,6 +3420,14 @@ dbase_redo(XLogReaderState *record) /* Create the database directory with the version file. 
*/ CreateDirAndVersionFile(dbpath, xlrec->db_id, xlrec->tablespace_id, true); + + /* + * Make sure any future requests to the page server see the new + * database. + */ + if (set_lwlsn_db_hook) + set_lwlsn_db_hook(record->EndRecPtr); + pfree(dbpath); } else if (info == XLOG_DBASE_DROP) diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 4da68312b5f..7123dedb55b 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -352,6 +352,7 @@ DSMRegistry "Waiting to read or update the dynamic shared memory registry." InjectionPoint "Waiting to read or update information related to injection points." SerialControl "Waiting to read or update shared pg_serial state." AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." +LastWrittenLsn "Waiting to read or update information related to written recent changes." # # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d313099c027..5a5d7b37ac1 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -16,6 +16,8 @@ #include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" +#include "storage/block.h" +#include "storage/relfilelocator.h" /* Sync methods */ @@ -33,6 +35,12 @@ extern PGDLLIMPORT XLogRecPtr ProcLastRecPtr; extern PGDLLIMPORT XLogRecPtr XactLastRecEnd; extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; +/* + * Pseudo block number used to associate LSN with relation metadata (relation size) + */ +#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber + + /* these variables are GUC parameters related to XLOG */ extern PGDLLIMPORT int wal_segment_size; extern PGDLLIMPORT int min_wal_size_mb; @@ -59,6 +67,7 @@ extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; + /* Archive modes */ typedef enum ArchiveMode { @@ -255,11 +264,31 @@ 
extern TimeLineID GetWALInsertionTimeLine(void); extern TimeLineID GetWALInsertionTimeLineIfSet(void); extern XLogRecPtr GetLastImportantRecPtr(void); +/* neon specifics */ + +extern void SetRedoStartLsn(XLogRecPtr RedoStartLSN); +extern XLogRecPtr GetRedoStartLsn(void); + extern void SetWalWriterSleeping(bool sleeping); extern Size WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count, TimeLineID tli); +/* Hooks for LwLSN */ +typedef XLogRecPtr (*set_lwlsn_block_hook_type)(XLogRecPtr lsn, RelFileLocator relfilenode, ForkNumber forknum, BlockNumber blkno); +typedef XLogRecPtr (*set_lwlsn_block_range_hook_type)(XLogRecPtr lsn, RelFileLocator relfilenode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks); +typedef XLogRecPtr (*set_lwlsn_block_v_hook_type)(const XLogRecPtr *lsns, RelFileLocator relfilenode, ForkNumber forknum, BlockNumber blockno, int nblocks); +typedef XLogRecPtr (*set_lwlsn_db_hook_type)(XLogRecPtr lsn); +typedef XLogRecPtr (*set_lwlsn_relation_hook_type)(XLogRecPtr lsn, RelFileLocator relfilenode, ForkNumber forknum); +typedef void (*set_max_lwlsn_hook_type) (XLogRecPtr lsn); + +extern set_lwlsn_block_hook_type set_lwlsn_block_hook; +extern set_lwlsn_block_range_hook_type set_lwlsn_block_range_hook; +extern set_lwlsn_block_v_hook_type set_lwlsn_block_v_hook; +extern set_lwlsn_db_hook_type set_lwlsn_db_hook; +extern set_lwlsn_relation_hook_type set_lwlsn_relation_hook; +extern set_max_lwlsn_hook_type set_max_lwlsn_hook; + /* * Routines used by xlogrecovery.c to call back into xlog.c during recovery. 
*/ diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index a9681738146..66acb61e065 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -84,3 +84,4 @@ PG_LWLOCK(50, DSMRegistry) PG_LWLOCK(51, InjectionPoint) PG_LWLOCK(52, SerialControl) PG_LWLOCK(53, AioWorkerSubmissionQueue) +PG_LWLOCK(54, LastWrittenLsn) From 5cad471ddb7fae8dfc939f610dc716e9697832a8 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 27 Jun 2025 16:05:43 +0200 Subject: [PATCH 07/50] Neon: Wrap darwin.h's USE_PREFETCH with ifndef USE_PREFETCH This prevents excessive USE_PREFETCH redefinition warnings if the compiler gets USE_PREFETCH from elsewhere with a different definition value. --- src/include/port/darwin.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/include/port/darwin.h b/src/include/port/darwin.h index 6aa2ea70f6b..f68972d586b 100644 --- a/src/include/port/darwin.h +++ b/src/include/port/darwin.h @@ -10,4 +10,6 @@ /* * macOS has a platform-specific implementation of prefetching. 
*/ +#ifndef USE_PREFETCH #define USE_PREFETCH +#endif \ No newline at end of file From fa0f2bef1c564e322ce39cb72e17d281fcf49848 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 31 Jul 2024 14:22:58 +0200 Subject: [PATCH 08/50] NEON: Expose lag tracker, CHECK_FOR_INTERRUPTS, add backpressure --- src/backend/access/transam/xloginsert.c | 12 +++++ src/backend/replication/walsender.c | 52 ++++++++++++++----- src/backend/tcop/postgres.c | 10 ++++ .../utils/activity/wait_event_names.txt | 1 + src/backend/utils/misc/guc_tables.c | 37 +++++++++++++ src/include/access/xloginsert.h | 4 ++ src/include/replication/walsender.h | 25 ++++++++- 7 files changed, 127 insertions(+), 14 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 5ee9d0b028e..679c70f13ae 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -36,9 +36,11 @@ #include "miscadmin.h" #include "pg_trace.h" #include "replication/origin.h" +#include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" +#include "utils/wait_event.h" /* * Guess the maximum buffer size required to store a compressed version of @@ -86,6 +88,11 @@ typedef struct char compressed_page[COMPRESS_BUFSIZE]; } registered_buffer; +/* GUCs */ +int max_replication_apply_lag; +int max_replication_flush_lag; +int max_replication_write_lag; + static registered_buffer *registered_buffers; static int max_registered_buffers; /* allocated size */ static int max_registered_block_id = 0; /* highest block_id + 1 currently @@ -501,6 +508,11 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } + if (delay_backend_us != NULL && delay_backend_us() > 0) + { + InterruptPending = true; + } + do { XLogRecPtr RedoRecPtr; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index f2c33250e8b..7c93d92d819 100644 --- a/src/backend/replication/walsender.c +++ 
b/src/backend/replication/walsender.c @@ -53,6 +53,7 @@ #include "access/transam.h" #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/xloginsert.h" #include "access/xlogreader.h" #include "access/xlogrecovery.h" #include "access/xlogutils.h" @@ -134,6 +135,11 @@ bool log_replication_commands = false; */ bool wake_wal_senders = false; +/* + * Backpressure hook, detecting how much we should delay. + */ +uint64 (*delay_backend_us)(void) = NULL; + /* * xlogreader used for replication. Note that a WAL sender doing physical * replication does not need xlogreader to read WAL, but it needs one to @@ -270,8 +276,6 @@ static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, Transac static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool skipped_xact); static XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); -static void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); -static TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch); static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, @@ -1795,7 +1799,7 @@ NeedToWaitForWal(XLogRecPtr target_lsn, XLogRecPtr flushed_lsn, * detect a shutdown request (either from postmaster or client) we will return * early, so caller must always check. */ -static XLogRecPtr +XLogRecPtr WalSndWaitForWal(XLogRecPtr loc) { int wakeEvents; @@ -2366,7 +2370,7 @@ ProcessStandbyMessage(void) /* * Remember that a walreceiver just confirmed receipt of lsn `lsn`. 
*/ -static void +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn) { bool changed = false; @@ -2406,21 +2410,34 @@ ProcessStandbyReplyMessage(void) flushPtr, applyPtr; bool replyRequested; - TimeOffset writeLag, - flushLag, - applyLag; - bool clearLagTimes; - TimestampTz now; TimestampTz replyTime; - static bool fullyAppliedLastTime = false; - /* the caller already consumed the msgtype byte */ writePtr = pq_getmsgint64(&reply_message); flushPtr = pq_getmsgint64(&reply_message); applyPtr = pq_getmsgint64(&reply_message); replyTime = pq_getmsgint64(&reply_message); replyRequested = pq_getmsgbyte(&reply_message); + ProcessStandbyReply(writePtr, + flushPtr, + applyPtr, + replyTime, + replyRequested); +} + +void +ProcessStandbyReply(XLogRecPtr writePtr, + XLogRecPtr flushPtr, + XLogRecPtr applyPtr, + TimestampTz replyTime, + bool replyRequested) +{ + TimeOffset writeLag, + flushLag, + applyLag; + bool clearLagTimes; + TimestampTz now; + static bool fullyAppliedLastTime = false; if (message_level_is_interesting(DEBUG2)) { @@ -2603,7 +2620,16 @@ ProcessStandbyHSFeedbackMessage(void) feedbackEpoch = pq_getmsgint(&reply_message, 4); feedbackCatalogXmin = pq_getmsgint(&reply_message, 4); feedbackCatalogEpoch = pq_getmsgint(&reply_message, 4); + ProcessStandbyHSFeedback(replyTime, feedbackXmin, feedbackEpoch, feedbackCatalogXmin, feedbackCatalogEpoch); +} +void +ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch) +{ if (message_level_is_interesting(DEBUG2)) { char *replyTimeStr; @@ -4128,7 +4154,7 @@ WalSndKeepaliveIfNecessary(void) * eventually reported to have been written, flushed and applied by the * standby in a reply message. 
*/ -static void +void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) { bool buffer_full; @@ -4193,7 +4219,7 @@ LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) * Return -1 if no new sample data is available, and otherwise the elapsed * time in microseconds. */ -static TimeOffset +TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) { TimestampTz time = 0; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 2f8c3d5f918..46c3af066c9 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -162,6 +162,8 @@ static volatile sig_atomic_t RecoveryConflictPendingReasons[NUM_PROCSIGNALS]; static MemoryContext row_description_context = NULL; static StringInfoData row_description_buf; +process_interrupts_callback_t ProcessInterruptsCallback; + /* ---------------------------------------------------------------- * decls for routines only used in this file * ---------------------------------------------------------------- @@ -3302,6 +3304,7 @@ ProcessInterrupts(void) return; InterruptPending = false; +retry: if (ProcDiePending) { ProcDiePending = false; @@ -3535,6 +3538,13 @@ ProcessInterrupts(void) if (ParallelApplyMessagePending) ProcessParallelApplyMessages(); + + /* Call registered callback if any */ + if (ProcessInterruptsCallback) + { + if (ProcessInterruptsCallback()) + goto retry; + } } /* diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 7123dedb55b..6953239ae3f 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -182,6 +182,7 @@ SPIN_DELAY "Waiting while acquiring a contended spinlock." VACUUM_DELAY "Waiting in a cost-based vacuum delay point." VACUUM_TRUNCATE "Waiting to acquire an exclusive lock to truncate off any empty pages at the end of a table vacuumed." WAL_SUMMARIZER_ERROR "Waiting after a WAL summarizer error." 
+BACK_PRESSURE "Waiting to relieve backpressure." ABI_compatibility: diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index d14b1678e7f..7225ae288f3 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -35,6 +35,7 @@ #include "access/toast_compression.h" #include "access/twophase.h" #include "access/xlog_internal.h" +#include "access/xloginsert.h" #include "access/xlogprefetcher.h" #include "access/xlogrecovery.h" #include "access/xlogutils.h" @@ -3071,6 +3072,42 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_replication_apply_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal write lag between master and replicas."), + gettext_noop("When lag between minimal apply position of replica and current LSN exceeds this value," + "backends are blocked."), + GUC_UNIT_MB, + }, + &max_replication_apply_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_flush_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal flush lag between master and replicas."), + gettext_noop("When lag between minimal flush position of replica and current LSN exceeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_flush_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal write lag between master and replicas."), + gettext_noop("When lag between minimal write position of replica and current LSN exceeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_write_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + { {"max_slot_wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, gettext_noop("Sets the maximum WAL size 
that can be reserved by replication slots."), diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index cf057f033a2..43340653e0f 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -40,6 +40,10 @@ * is taken */ #define REGBUF_NO_CHANGE 0x20 /* intentionally register clean buffer */ +extern int max_replication_apply_lag; +extern int max_replication_flush_lag; +extern int max_replication_write_lag; + /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); extern void XLogSetRecordFlags(uint8 flags); diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index c3e8e191339..3d2c7f8df2e 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -12,7 +12,8 @@ #ifndef _WALSENDER_H #define _WALSENDER_H -#include "access/xlogdefs.h" +#include "access/xlog.h" +#include "signal.h" /* * What to do with a snapshot in create replication slot command. @@ -35,6 +36,9 @@ extern PGDLLIMPORT int max_wal_senders; extern PGDLLIMPORT int wal_sender_timeout; extern PGDLLIMPORT bool log_replication_commands; +struct XLogReaderRoutine; +extern PGDLLIMPORT void (*WalSender_Custom_XLogReaderRoutines)(struct XLogReaderRoutine *xlr); + extern void InitWalSender(void); extern bool exec_replication_command(const char *cmd_string); extern void WalSndErrorCleanup(void); @@ -49,6 +53,25 @@ extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); +/* + * Hook to check for WAL receiving backpressure. 
+ * Return value in microseconds + */ +extern uint64 (*delay_backend_us)(void); + +/* expose these so that they can be reused by the neon walproposer extension */ +extern void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); +extern TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); +extern void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, + XLogRecPtr applyPtr, TimestampTz replyTime, + bool replyRequested); +extern void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); +extern void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); + /* * Remember that we want to wakeup walsenders later * From 37ab94b92be037a55298c078cc6b0d459864af29 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 31 Jul 2024 14:52:48 +0200 Subject: [PATCH 09/50] NEON: WalRedo & startup process compatibility We also move certain phases of the CheckPoint process ahead of the final checkpoint record during shutdown, to allow those processes to write out their changes into WAL. 
--- src/backend/access/gin/ginxlog.c | 25 +- src/backend/access/transam/xlog.c | 248 ++++++++++++++++++-- src/backend/access/transam/xlogreader.c | 125 +++++++--- src/backend/access/transam/xlogrecovery.c | 190 +++++++++++++++- src/backend/access/transam/xlogutils.c | 17 ++ src/backend/replication/walreceiver.c | 7 + src/backend/storage/buffer/buf_init.c | 8 + src/backend/storage/buffer/bufmgr.c | 41 +++- src/backend/storage/buffer/localbuf.c | 15 +- src/backend/utils/misc/guc_tables.c | 9 + src/bin/pg_waldump/pg_waldump.c | 264 +++++++++++++++++++--- src/include/access/xlog.h | 16 ++ src/include/access/xlogreader.h | 7 + src/include/access/xlogrecovery.h | 3 + src/include/access/xlogutils.h | 6 + src/include/miscadmin.h | 7 + src/include/storage/buf_internals.h | 2 + src/include/storage/bufmgr.h | 5 +- 18 files changed, 888 insertions(+), 107 deletions(-) diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 55a1ec09776..f6b87103691 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -407,6 +407,7 @@ ginRedoSplit(XLogReaderState *record) rootbuf; bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0; + XLogRedoAction action; /* * First clear incomplete-split flag on child page if this finishes a @@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record) if (!isLeaf) ginRedoClearIncompleteSplit(record, 3); - if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 0, &lbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of left page"); - if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 1, &rbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of right page"); if (isRoot) { - if 
(XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 2, &rootbuf); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of root page"); - UnlockReleaseBuffer(rootbuf); + if (rootbuf != InvalidBuffer) + UnlockReleaseBuffer(rootbuf); } - UnlockReleaseBuffer(rbuffer); - UnlockReleaseBuffer(lbuffer); + if (rbuffer != InvalidBuffer) + UnlockReleaseBuffer(rbuffer); + if (lbuffer != InvalidBuffer) + UnlockReleaseBuffer(lbuffer); } /* @@ -443,9 +450,11 @@ ginRedoVacuumPage(XLogReaderState *record) if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED) { - elog(ERROR, "replay of gin entry tree page vacuum did not restore the page"); + /* NEON: we do not apply WAL record if target page is absent at replica */ + elog(DEBUG2, "replay of gin entry tree page vacuum did not restore the page"); } - UnlockReleaseBuffer(buffer); + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); } static void diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 3fe774ed157..79050b7043f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -146,6 +146,9 @@ bool XLOG_DEBUG = false; int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; +/* NEON: Hook to allow the neon extension to restore running-xacts from CLOG at replica startup */ +restore_running_xacts_callback_t restore_running_xacts_callback = NULL; + /* NEON: Hooks to facilitate the last written LSN cache */ set_lwlsn_block_hook_type set_lwlsn_block_hook = NULL; set_lwlsn_block_range_hook_type set_lwlsn_block_range_hook = NULL; set_lwlsn_block_v_hook_type set_lwlsn_block_v_hook = NULL; set_lwlsn_db_hook_type set_lwlsn_db_hook = NULL; set_lwlsn_relation_hook_type set_lwlsn_relation_hook = NULL; set_max_lwlsn_hook_type set_max_lwlsn_hook = NULL; @@ -213,7 +216,6 @@ const struct config_enum_entry archive_mode_options[] = { {NULL, 0, false} }; - /* * Statistics for current checkpoint are collected in this global struct. 
* Because only the checkpointer or a stand-alone backend can perform @@ -682,6 +684,15 @@ static bool holdingAllLocks = false; static MemoryContext walDebugCxt = NULL; #endif + +/* + * Variables read from 'neon.signal' file. + */ +bool NeonRecoveryRequested = false; +XLogRecPtr neonLastRec = InvalidXLogRecPtr; +bool neonWriteOk = false; + + static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog, TimeLineID newTLI); @@ -692,6 +703,7 @@ static void CreateEndOfRecoveryRecord(void); static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, TimeLineID newTLI); +static void PreCheckPointGuts(int flags); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); @@ -742,6 +754,7 @@ static void WALInsertLockAcquireExclusive(void); static void WALInsertLockRelease(void); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); + /* * Insert an XLOG record represented by an already-constructed chain of data * chunks. 
This is a low-level routine; to construct the WAL record header @@ -3715,11 +3728,15 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, XLogFilePath(path, tli, *segno, wal_segment_size); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - if (!XLogCtl->InstallXLogFileSegmentActive) - { - LWLockRelease(ControlFileLock); - return false; + if (XLogCtl) + { + /* Neon: in case of sync-safekeepers shared memory is not initialized */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (!XLogCtl->InstallXLogFileSegmentActive) + { + LWLockRelease(ControlFileLock); + return false; + } } if (!find_free) @@ -3735,7 +3752,8 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, if ((*segno) >= max_segno) { /* Failed to find a free slot within specified range */ - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); return false; } (*segno)++; @@ -3746,12 +3764,14 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, Assert(access(path, F_OK) != 0 && errno == ENOENT); if (durable_rename(tmppath, path, LOG) != 0) { - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); /* durable_rename already emitted log message */ return false; } - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); return true; } @@ -5620,6 +5640,81 @@ CheckRequiredParameterValues(void) } } +static void +readNeonSignalFile(void) +{ + int fd; + + fd = BasicOpenFile(NEON_SIGNAL_FILE, O_RDONLY | PG_BINARY); + if (fd >= 0) + { + struct stat statbuf; + char *content; + char prev_lsn_str[20]; + + /* Slurp the file into a string */ + if (stat(NEON_SIGNAL_FILE, &statbuf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + NEON_SIGNAL_FILE))); + content = palloc(statbuf.st_size + 1); + if (read(fd, content, statbuf.st_size) != statbuf.st_size) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + NEON_SIGNAL_FILE))); + 
content[statbuf.st_size] = '\0'; + + /* Parse it */ + if (sscanf(content, "PREV LSN: %19s", prev_lsn_str) != 1) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", NEON_SIGNAL_FILE))); + + if (strcmp(prev_lsn_str, "invalid") == 0) + { + /* No prev LSN. Forbid starting up in read-write mode */ + neonLastRec = InvalidXLogRecPtr; + neonWriteOk = false; + } + else if (strcmp(prev_lsn_str, "none") == 0) + { + /* + * The page server had no valid prev LSN, but assured that it's ok + * to start without it. This happens when you start the compute + * node for the first time on a new branch. + */ + neonLastRec = InvalidXLogRecPtr; + neonWriteOk = true; + } + else + { + uint32 hi, + lo; + + if (sscanf(prev_lsn_str, "%X/%X", &hi, &lo) != 2) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", NEON_SIGNAL_FILE))); + neonLastRec = ((uint64) hi) << 32 | lo; + + /* If prev LSN is given, it better be valid */ + if (neonLastRec == InvalidXLogRecPtr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid prev-LSN in file \"%s\"", NEON_SIGNAL_FILE))); + neonWriteOk = true; + } + NeonRecoveryRequested = true; + close(fd); + + elog(LOG, + "[NEON] found \"%s\" file. setting prev LSN to %X/%X", + NEON_SIGNAL_FILE, LSN_FORMAT_ARGS(neonLastRec)); + } +} + /* * This must be called ONCE during postmaster or standalone-backend startup */ @@ -5651,10 +5746,15 @@ StartupXLOG(void) CurrentResourceOwner == AuxProcessResourceOwner); CurrentResourceOwner = AuxProcessResourceOwner; + /* + * Read neon.signal before anything else. + */ + readNeonSignalFile(); + /* * Check that contents look valid. 
*/ - if (!XRecOffIsValid(ControlFile->checkPoint)) + if (!XRecOffIsValid(ControlFile->checkPoint) && !NeonRecoveryRequested) ereport(FATAL, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("control file contains invalid checkpoint location"))); @@ -5984,8 +6084,9 @@ StartupXLOG(void) */ if (ArchiveRecoveryRequested && EnableHotStandby) { - TransactionId *xids; + TransactionId *xids = NULL; int nxids; + bool apply_running_xacts = false; ereport(DEBUG1, (errmsg_internal("initializing for hot standby"))); @@ -6013,14 +6114,33 @@ StartupXLOG(void) * nothing was running on the primary at this point. So fake-up an * empty running-xacts record and use that here and now. Recover * additional standby state for prepared transactions. + * + * Neon: We also use a similar mechanism to start up sooner at + * replica startup, by scanning the CLOG and faking a running-xacts + * record based on that. */ if (wasShutdown) + { + /* Update pg_subtrans entries for any prepared transactions */ + StandbyRecoverPreparedTransactions(); + apply_running_xacts = true; + } + else if (restore_running_xacts_callback) + { + /* + * Update pg_subtrans entries for any prepared transactions before + * calling the extension hook. + */ + StandbyRecoverPreparedTransactions(); + apply_running_xacts = restore_running_xacts_callback(&checkPoint, &xids, &nxids); + } + + if (apply_running_xacts) { RunningTransactionsData running; TransactionId latestCompletedXid; - /* Update pg_subtrans entries for any prepared transactions */ - StandbyRecoverPreparedTransactions(); + /* Neon: called StandbyRecoverPreparedTransactions() above already */ /* * Construct a RunningTransactions snapshot representing a @@ -7151,6 +7271,11 @@ CreateCheckPoint(int flags) */ SyncPreCheckpoint(); + /* + * NEON: perform checkpoint action requiring write to the WAL before we determine the REDO pointer. + */ + PreCheckPointGuts(flags); + /* * Use a critical section to force system panic if we have trouble. 
*/ @@ -7724,6 +7849,32 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, return recptr; } +static void +CheckPointReplicationState(int flags) +{ + CheckPointRelationMap(); + CheckPointReplicationSlots((flags & CHECKPOINT_IS_SHUTDOWN) != 0); + CheckPointSnapBuild(); + CheckPointLogicalRewriteHeap(); + CheckPointReplicationOrigin(); +} + +/* + * NEON: we use logical records to persist information about slots, origins, relation map... + * If it is done inside shutdown checkpoint, then Postgres panics: "concurrent write-ahead log activity while database system is shutting down" + * So do it before checkpoint REDO position is determined. + * The same is true for CheckPointBuffers which wallog dirty FSM/VM pages. + */ +static void +PreCheckPointGuts(int flags) +{ + if (flags & (CHECKPOINT_IS_SHUTDOWN|CHECKPOINT_END_OF_RECOVERY)) + { + CheckPointReplicationState(flags); + CheckPointBuffers(flags); + } +} + /* * Flush all data in shared memory to disk, and fsync * @@ -7733,11 +7884,9 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { - CheckPointRelationMap(); - CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN); - CheckPointSnapBuild(); - CheckPointLogicalRewriteHeap(); - CheckPointReplicationOrigin(); + /* NEON: In shutdown checkpoints we've already checkpointed the replication state */ + if (!(flags & (CHECKPOINT_IS_SHUTDOWN|CHECKPOINT_END_OF_RECOVERY))) + CheckPointReplicationState(flags); /* Write out all dirty data in SLRUs and the main buffer pool */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); @@ -7747,7 +7896,14 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointSUBTRANS(); CheckPointMultiXact(); CheckPointPredicate(); - CheckPointBuffers(flags); + /* + * NEON: Checkpoint buffer will write dirty pages to the disk and Neon SMGR + * wallog FSM/VM pages to persist them at page server. 
+ * Writing to the WAL during shutdown checkpoint causes Postgres panic. + * So do it before in PreCheckPointGuts. + */ + if (!(flags & (CHECKPOINT_IS_SHUTDOWN|CHECKPOINT_END_OF_RECOVERY))) + CheckPointBuffers(flags); /* Perform all queued up fsyncs */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); @@ -7912,6 +8068,14 @@ CreateRestartPoint(int flags) /* Update the process title */ update_checkpoint_display(flags, true, false); + /* + * NEON: CheckPointGuts expects certain operations to have happened + * before, if the checkpoint is a shutdown checkpoint (indicated as + * (flags & CHECKPOINT_IS_SHUTDOWN) != 0). However, we don't care + * much about shutdown checkpoints (we always rebuild the whole data + * directory) so we don't care much about certain data not having + * been synced - we'll get new and consistent data at the next start. + */ CheckPointGuts(lastCheckPoint.redo, flags); /* @@ -8703,6 +8867,7 @@ xlog_redo(XLogReaderState *record) for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) { Buffer buffer; + XLogRedoAction result; if (!XLogRecHasBlockImage(record, block_id)) { @@ -8711,9 +8876,23 @@ continue; } - if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) + result = XLogReadBufferForRedo(record, block_id, &buffer); + + if (result == BLK_DONE && (!IsUnderPostmaster || StandbyMode)) + { + /* + * NEON: In the special WAL redo process, blocks that are being + * ignored return BLK_DONE. Accept that. + * Additionally, in standby mode, blocks that are not present + * in shared buffers are ignored during replay, so we also + * ignore those blocks. 
+ */ + } + else if (result != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); - UnlockReleaseBuffer(buffer); + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); } } else if (info == XLOG_BACKUP_END) @@ -9725,3 +9904,30 @@ SetWalWriterSleeping(bool sleeping) XLogCtl->WalWriterSleeping = sleeping; SpinLockRelease(&XLogCtl->info_lck); } + +void +XLogUpdateWalBuffers(char* data, XLogRecPtr start, size_t len) +{ + XLogRecPtr end; + int idx; + XLogRecPtr pagebegptr; + + end = start + len; + Assert(len + (start & (XLOG_BLCKSZ - 1)) <= XLOG_BLCKSZ); + + WALInsertLockAcquireExclusive(); + idx = XLogRecPtrToBufIdx(end); + pagebegptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]) - XLOG_BLCKSZ; + + if (pagebegptr + XLOG_BLCKSZ >= end && pagebegptr < end) + { + /* Last page of the segment is present in WAL buffers */ + char* page = &XLogCtl->pages[idx * XLOG_BLCKSZ]; + size_t overlap = end - pagebegptr; + if (overlap <= len) + memcpy(page, data + len - overlap, overlap); + else + memcpy(page + overlap - len, data, len); + } + WALInsertLockRelease(); +} diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 5c26d33a603..72044678f0d 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -231,7 +231,7 @@ WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) { - Assert(!XLogRecPtrIsInvalid(RecPtr)); + Assert(!XLogRecPtrIsInvalid(RecPtr) || state->skip_lsn_checks); ResetDecoder(state); @@ -255,6 +255,14 @@ XLogReleasePreviousRecord(XLogReaderState *state) if (!state->record) return InvalidXLogRecPtr; +#define SKIP_INVALID_RECORD(rec_ptr) do { \ + rec_ptr = MAXALIGN(rec_ptr + 1); \ + if (rec_ptr % XLOG_BLCKSZ <= MAXALIGN(1)) \ + goto restart; \ + else \ + goto skip_invalid; \ + } while (0); + /* * Remove it from the decoded record queue. 
It must be the oldest item * decoded, decode_queue_head. @@ -435,7 +443,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) * Return NULL if there is no space in the decode buffer and allow_oversized * is false, or if memory allocation fails for an oversized buffer. */ -static DecodedXLogRecord * +DecodedXLogRecord * XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized) { size_t required_space = DecodeXLogRecordRequiredSpace(xl_tot_len); @@ -578,7 +586,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) * valid record starting position or alternatively to the beginning of * a page. See the header comments for XLogBeginRead. */ - Assert(RecPtr % XLOG_BLCKSZ == 0 || XRecOffIsValid(RecPtr)); + Assert(RecPtr % XLOG_BLCKSZ == 0 || XRecOffIsValid(RecPtr) || state->skip_lsn_checks); randAccess = true; } @@ -617,18 +625,24 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u", - LSN_FORMAT_ARGS(RecPtr), - pageHeaderSize, targetRecOff); - goto err; + if(!state->skip_page_validation) + { + report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + pageHeaderSize, targetRecOff); + goto err; + } } if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", - LSN_FORMAT_ARGS(RecPtr)); - goto err; + if(!state->skip_page_validation) + { + report_invalid_record(state, "contrecord is requested by %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } } /* ReadPageInternal has verified the page header */ @@ -643,6 +657,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) * cannot access any other fields until we've verified that we got the * whole header. 
*/ +skip_invalid: record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); total_len = record->xl_tot_len; @@ -658,7 +673,13 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) { if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } else @@ -666,11 +687,16 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* There may be no next page if it's too small. */ if (total_len < SizeOfXLogRecord) { - report_invalid_record(state, - "invalid record length at %X/%X: expected at least %u, got %u", - LSN_FORMAT_ARGS(RecPtr), - (uint32) SizeOfXLogRecord, total_len); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "invalid record length at %X/%X: expected at least %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* We'll validate the header once we have the next page. 
*/ gotheader = false; @@ -756,10 +782,15 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Check that the continuation on next page looks valid */ if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { - report_invalid_record(state, - "there is no contrecord flag at %X/%X", - LSN_FORMAT_ARGS(RecPtr)); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "there is no contrecord flag at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* @@ -769,12 +800,17 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (pageHeader->xlp_rem_len == 0 || total_len != (pageHeader->xlp_rem_len + gotlen)) { - report_invalid_record(state, - "invalid contrecord length %u (expected %lld) at %X/%X", - pageHeader->xlp_rem_len, - ((long long) total_len) - gotlen, - LSN_FORMAT_ARGS(RecPtr)); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "invalid contrecord length %u (expected %lld) at %X/%X", + pageHeader->xlp_rem_len, + ((long long) total_len) - gotlen, + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* Wait for the next page to become available */ @@ -814,7 +850,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } gotheader = true; } @@ -844,7 +885,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); state->DecodeRecPtr = RecPtr; @@ -863,7 +909,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool 
nonblocking) /* Record does not cross a page boundary */ if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } state->NextRecPtr = RecPtr + MAXALIGN(total_len); @@ -1060,7 +1111,8 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) Assert(readLen == XLOG_BLCKSZ); if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, - state->readBuf)) + state->readBuf) && + !state->skip_page_validation) goto err; } @@ -1101,7 +1153,8 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* * Now that we know we have the full header, validate it. */ - if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr) && + !state->skip_page_validation) goto err; /* update read state information */ @@ -1160,7 +1213,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * We can't exactly verify the prev-link, but surely it should be less * than the record's own address. */ - if (!(record->xl_prev < RecPtr)) + if (!(record->xl_prev < RecPtr) && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -1176,7 +1229,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * check guards against torn WAL pages where a stale but valid-looking * WAL record starts on a sector boundary. */ - if (record->xl_prev != PrevRecPtr) + if (record->xl_prev != PrevRecPtr && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -1321,7 +1374,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, * check typically fails when an old WAL segment is recycled, and hasn't * yet been overwritten with new data yet. 
*/ - if (hdr->xlp_pageaddr != recptr) + if (hdr->xlp_pageaddr != recptr && !state->skip_lsn_checks) { char fname[MAXFNAMELEN]; @@ -1830,7 +1883,9 @@ DecodeXLogRecord(XLogReaderState *state, if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) && (blk->hole_offset == 0 || blk->hole_length == 0 || - blk->bimg_len == BLCKSZ)) + blk->bimg_len == BLCKSZ) && + !(blk->hole_offset == 0 && + blk->hole_length == BLCKSZ)) /* null page */ { report_invalid_record(state, "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 6ce979f2d8b..c34d0a6dfc6 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -344,6 +344,7 @@ typedef struct XLogRecoveryCtlData XLogRecPtr lastReplayedReadRecPtr; /* start position */ XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */ TimeLineID lastReplayedTLI; /* timeline */ + ConditionVariable replayProgressCV; /* CV for waiters */ /* * When we're currently replaying a record, ie. in a redo function, @@ -473,6 +474,7 @@ XLogRecoveryShmemInit(void) SpinLockInit(&XLogRecoveryCtl->info_lck); InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + ConditionVariableInit(&XLogRecoveryCtl->replayProgressCV); ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV); } @@ -494,6 +496,64 @@ EnableStandbyMode(void) disable_startup_progress_timeout(); } +/* + * Wait for recovery to complete replaying all WAL up to and including + * redoEndRecPtr. + * + * This gets woken up for every WAL record replayed, so make sure you're not + * trying to wait an LSN that is too far in the future. 
+ */ +void +XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr) +{ + static XLogRecPtr replayRecPtr = 0; + + if (!RecoveryInProgress()) + return; + + /* + * Check the backend-local variable first, we may be able to skip accessing + * shared memory (which requires locking) + */ + if (redoEndRecPtr <= replayRecPtr) + return; + + replayRecPtr = GetXLogReplayRecPtr(NULL); + + /* + * Check again if we're going to need to wait, now that we've updated + * the local cached variable. + */ + if (redoEndRecPtr <= replayRecPtr) + return; + + /* + * We need to wait for the variable, so prepare for that. + * + * Note: This wakes up every time a WAL record is replayed, so this can + * be expensive. + */ + ConditionVariablePrepareToSleep(&XLogRecoveryCtl->replayProgressCV); + + while (redoEndRecPtr > replayRecPtr) + { + bool timeout; + timeout = ConditionVariableTimedSleep(&XLogRecoveryCtl->replayProgressCV, + 10000, /* 10 seconds, in millis */ + WAIT_EVENT_RECOVERY_WAL_STREAM); + + replayRecPtr = GetXLogReplayRecPtr(NULL); + + if (timeout) + ereport(LOG, + (errmsg("Waiting for recovery to catch up to %X/%X (currently %X/%X)", + LSN_FORMAT_ARGS(redoEndRecPtr), + LSN_FORMAT_ARGS(replayRecPtr)))); + } + + ConditionVariableCancelSleep(); +} + /* * Prepare the system for WAL recovery, if needed. * @@ -707,6 +767,56 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, /* tell the caller to delete it later */ haveBackupLabel = true; } + else if (NeonRecoveryRequested) + { + /* + * NEON: hacks to spawn compute node without WAL. Pretend that we + * just finished reading the record that started at 'neonLastRec' + * and ended at checkpoint.redo + */ + elog(LOG, "starting with neon basebackup at LSN %X/%X, prev %X/%X", + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + LSN_FORMAT_ARGS(neonLastRec)); + + CheckPointLoc = neonLastRec; + CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; + RedoStartLSN = ControlFile->checkPointCopy.redo; + // FIXME needs review. 
rebase of ff41b709abea6a9c42100a4fcb0ff434b2c846c9 + // Is it still relevant? + /* make basebackup LSN available for walproposer */ + SetRedoStartLsn(RedoStartLSN); + //EndRecPtr = ControlFile->checkPointCopy.redo; + + memcpy(&checkPoint, &ControlFile->checkPointCopy, sizeof(CheckPoint)); + + /* + * When a primary Neon compute node is started, we pretend that it + * started after a clean shutdown and no recovery is needed. We don't + * need to do WAL replay to make the database consistent, the page + * server does that on a page-by-page basis. + * + * When starting a read-only replica, we will also start from a + * consistent snapshot of the cluster the start LSN, so we don't need + * to perform WAL replay to get there. However, wasShutdown must be + * set to false, because wasShutdown==true would imply that there are + * no transactions still in-progress at the start LSN, and we would + * initialize the known-assigned XIDs machinery for hot standby + * incorrectly. If this is a static read-only node that doesn't follow + * the primary though, then it's OK, as we will never see the possible + * commits of the in-progress transactions. + */ + if (StandbyModeRequested && + PrimaryConnInfo != NULL && *PrimaryConnInfo != '\0') + { + wasShutdown = false; + } + else + wasShutdown = true; + + /* Initialize expectedTLEs, like ReadRecord() does */ + expectedTLEs = readTimeLineHistory(checkPoint.ThisTimeLineID); + XLogPrefetcherBeginRead(xlogprefetcher, ControlFile->checkPointCopy.redo); + } else { /* No backup_label file has been found if we are here. 
*/ @@ -780,6 +890,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, CheckPointLoc = ControlFile->checkPoint; CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; RedoStartLSN = ControlFile->checkPointCopy.redo; + SetRedoStartLsn(RedoStartLSN); RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID; record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, CheckPointTLI); @@ -829,6 +940,9 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); + else if (NeonRecoveryRequested) + ereport(LOG, + (errmsg("starting neon recovery"))); else ereport(LOG, (errmsg("starting archive recovery"))); @@ -901,7 +1015,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, (errmsg("invalid next transaction ID"))); /* sanity check */ - if (checkPoint.redo > CheckPointLoc) + if (checkPoint.redo > CheckPointLoc && !NeonRecoveryRequested) ereport(PANIC, (errmsg("invalid redo in checkpoint record"))); @@ -1527,8 +1641,13 @@ FinishWalRecovery(void) lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr; lastRecTLI = XLogRecoveryCtl->lastReplayedTLI; } - XLogPrefetcherBeginRead(xlogprefetcher, lastRec); - (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + + if (!NeonRecoveryRequested) + { + XLogPrefetcherBeginRead(xlogprefetcher, lastRec); + (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + } + endOfLog = xlogreader->EndRecPtr; /* @@ -1562,11 +1681,56 @@ FinishWalRecovery(void) } } + /* + * When starting from a neon base backup, we don't have WAL. Initialize + * the WAL page where we will start writing new records from scratch, + * instead. + */ + if (NeonRecoveryRequested) + { + if (!neonWriteOk) + { + /* + * We cannot start generating new WAL if we don't have a valid prev-LSN + * to use for the first new WAL record. (Shouldn't happen.) 
+ */ + ereport(ERROR, + (errmsg("cannot start in read-write mode from this base backup"))); + } + else + { + int offs = endOfLog % XLOG_BLCKSZ; + char *page = palloc0(offs); + XLogRecPtr pageBeginPtr = endOfLog - offs; + int lastPageSize = ((pageBeginPtr % wal_segment_size) == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD; + + XLogPageHeader xlogPageHdr = (XLogPageHeader) (page); + + xlogPageHdr->xlp_pageaddr = pageBeginPtr; + xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; + xlogPageHdr->xlp_tli = recoveryTargetTLI; + /* + * If we start writing with offset from page beginning, pretend in + * page header there is a record ending where actual data will + * start. + */ + xlogPageHdr->xlp_rem_len = offs - lastPageSize; + xlogPageHdr->xlp_info = (xlogPageHdr->xlp_rem_len > 0) ? XLP_FIRST_IS_CONTRECORD : 0; + readOff = XLogSegmentOffset(pageBeginPtr, wal_segment_size); + + result->lastPageBeginPtr = pageBeginPtr; + result->lastPage = page; + elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); + + // FIXME: should we unlink neon.signal? + } + } + /* * Copy the last partial block to the caller, for initializing the WAL * buffer for appending new WAL. */ - if (endOfLog % XLOG_BLCKSZ != 0) + else if (endOfLog % XLOG_BLCKSZ != 0) { char *page; int len; @@ -1618,7 +1782,10 @@ ShutdownWalRecovery(void) char recoveryPath[MAXPGPATH]; /* Final update of pg_stat_recovery_prefetch. 
*/ - XLogPrefetcherComputeStats(xlogprefetcher); + if (!NeonRecoveryRequested) + { + XLogPrefetcherComputeStats(xlogprefetcher); + } /* Shut down xlogreader */ if (readFile >= 0) @@ -1627,7 +1794,11 @@ ShutdownWalRecovery(void) readFile = -1; } XLogReaderFree(xlogreader); - XLogPrefetcherFree(xlogprefetcher); + + if (!NeonRecoveryRequested) + { + XLogPrefetcherFree(xlogprefetcher); + } if (ArchiveRecoveryRequested) { @@ -1728,7 +1899,10 @@ PerformWalRecovery(void) else { /* just have to read next record after CheckPoint */ - Assert(xlogreader->ReadRecPtr == CheckPointLoc); + if (NeonRecoveryRequested) + xlogreader->ReadRecPtr = CheckPointLoc; + else + Assert(xlogreader->ReadRecPtr == CheckPointLoc); replayTLI = CheckPointTLI; record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); } @@ -2071,6 +2245,8 @@ ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *repl /* Reset the prefetcher. */ XLogPrefetchReconfigure(); } + + ConditionVariableBroadcast(&XLogRecoveryCtl->replayProgressCV); } /* diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index c389b27f77d..d2b6c4545d0 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -30,6 +30,8 @@ #include "utils/rel.h" +bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + /* GUC variable */ bool ignore_invalid_pages = false; @@ -359,6 +361,21 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, block_id); } + if (redo_read_buffer_filter && redo_read_buffer_filter(record, block_id)) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + { + *buf = ReadBufferWithoutRelcache(rlocator, forknum, + blkno, mode, NULL, true); + return BLK_DONE; + } + else + { + *buf = InvalidBuffer; + return BLK_DONE; + } + } + /* * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. 
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 8c4d0fd9aed..3f64e01f7d6 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -494,6 +494,13 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) if (endofwal) break; + /* + * Update WAL statistics, which are produced inside + * issue_xlog_fsync function. This is useful for counting + * WAL flushes, by querying pg_stat_wal. + */ + pgstat_report_wal(true); + /* Find the soonest wakeup time, to limit our nap. */ nextWakeup = TIMESTAMP_INFINITY; for (int i = 0; i < NUM_WALRCV_WAKEUPS; ++i) diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index ed1dc488a42..0ffe5b7a3d8 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -24,6 +24,14 @@ ConditionVariableMinimallyPadded *BufferIOCVArray; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; +/* + * Buffer with target WAL redo page. + * We must not evict this page from the buffer pool, but we cannot just keep it pinned because + * some WAL redo functions expect the page to not be pinned. So we have a special check in + * localbuf.c to prevent this buffer from being evicted. 
+ */ +Buffer wal_redo_buffer; +bool am_wal_redo_postgres = false; /* * Data Structures: diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 667aa0c0c78..384677bf5ef 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -179,6 +179,10 @@ int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER; int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER; int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER; +/* Evict unpinned pages (for better test coverage) */ +bool neon_test_evict = false; + + /* local state for LockBufferForCleanup */ static BufferDesc *PinCountWaitBuf = NULL; @@ -1032,7 +1036,13 @@ ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid) { BufferDesc *bufHdr; bool need_to_zero; - bool isLocalBuf = BufferIsLocal(buffer); + /* + * wal_redo postgres is working in single user mode, we do not need to + * synchronize access to shared buffer, so let's use local buffers instead. + * Note that this implies that when am_wal_redo_postgres is set, we always + * use local buffers. + */ + bool isLocalBuf = BufferIsLocal(buffer) || am_wal_redo_postgres; Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); @@ -1046,6 +1056,7 @@ ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid) } else if (isLocalBuf) { + Assert(BufferIsLocal(buffer)); /* Simple case for non-shared buffers. 
*/ bufHdr = GetLocalBufferDescriptor(-buffer - 1); need_to_zero = StartLocalBufferIO(bufHdr, true, false); @@ -3325,6 +3336,34 @@ UnpinBufferNoOwner(BufferDesc *buf) WakePinCountWaiter(buf); ForgetPrivateRefCountEntry(ref); + + if (neon_test_evict && !InRecovery) + { + buf_state = LockBufHdr(buf); + if ((buf_state & BM_VALID) && BUF_STATE_GET_REFCOUNT(buf_state) == 0) + { + ReservePrivateRefCountEntry(); + PinBuffer_Locked(buf); + if (buf_state & BM_DIRTY) + { + if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED)) + { + FlushOneBuffer(b); + LWLockRelease(BufferDescriptorGetContentLock(buf)); + } + } + + /* + * Try to invalidate the page. This can fail if the page is + * concurrently modified; we'll let it be in that case + */ + (void) InvalidateVictimBuffer(buf); + UnpinBuffer(buf); + } + else + UnlockBufHdr(buf, buf_state); + } } } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index ba26627f7b0..a32fe09138b 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -17,7 +17,9 @@ #include "access/parallel.h" #include "executor/instrument.h" +#include "miscadmin.h" #include "pgstat.h" +#include "../../../../../../pg_install/v17/include/postgresql/server/c.h" #include "storage/aio.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" @@ -25,8 +27,11 @@ #include "utils/guc_hooks.h" #include "utils/memdebug.h" #include "utils/memutils.h" +#include "utils/rel.h" #include "utils/resowner.h" +/* NEON: prevent eviction of the buffer of target page */ +extern Buffer wal_redo_buffer; /*#define LBDEBUG*/ @@ -244,7 +249,15 @@ GetLocalVictimBuffer(void) if (LocalRefCount[victim_bufid] == 0) { - uint32 buf_state = pg_atomic_read_u32(&bufHdr->state); + uint32 buf_state; + + if (-victim_bufid - 1 == wal_redo_buffer) + { + /* NEON: Prevent eviction of the buffer with target wal redo page */ + continue; + } + + buf_state = pg_atomic_read_u32(&bufHdr->state); if 
(BUF_STATE_GET_USAGECOUNT(buf_state) > 0) { diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 7225ae288f3..07283180592 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2144,6 +2144,15 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"neon_test_evict", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Evict unpinned pages (for better test coverage)"), + }, + &neon_test_evict, + false, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 51fb76efc48..ba5541be988 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -29,9 +29,12 @@ #include "common/logging.h" #include "common/relpath.h" #include "getopt_long.h" +#include "port/pg_bitutils.h" #include "rmgrdesc.h" #include "storage/bufpage.h" +#define OFFSET_INVALID ((size_t)-1) + /* * NOTE: For any code change or issue fix here, it is highly recommended to * give a thought about doing the same in pg_walinspect contrib module as well. 
@@ -50,6 +53,7 @@ typedef struct XLogDumpPrivate XLogRecPtr startptr; XLogRecPtr endptr; bool endptr_reached; + char* input_filename; } XLogDumpPrivate; typedef struct XLogDumpConfig @@ -63,6 +67,8 @@ typedef struct XLogDumpConfig bool stats; bool stats_per_record; + bool ignore_format_errors; + /* filter options */ bool filter_by_rmgr[RM_MAX_ID + 1]; bool filter_by_rmgr_enabled; @@ -94,6 +100,34 @@ sigint_handler(SIGNAL_ARGS) } #endif +/* calculate ceil(log base 2) of num */ +static int +my_log2(long num) +{ + /* + * guard against too-large input, which would be invalid for + * pg_ceil_log2_*() + */ + if (num > LONG_MAX / 2) + num = LONG_MAX / 2; + +#if SIZEOF_LONG < 8 + return pg_ceil_log2_32(num); +#else + return pg_ceil_log2_64(num); +#endif +} + +/* calculate first power of 2 >= num, bounded to what will fit in an int */ +static int +next_pow2_int(long num) +{ + if (num > INT_MAX / 2) + num = INT_MAX / 2; + return 1 << my_log2(num); +} + + static void print_rmgr_list(void) { @@ -207,7 +241,7 @@ open_file_in_directory(const char *directory, const char *fname) * wal segment size. 
*/ static bool -search_directory(const char *directory, const char *fname) +search_directory(const char *directory, const char *fname, bool ignore_format_errors) { int fd = -1; DIR *xldir; @@ -251,14 +285,36 @@ search_directory(const char *directory, const char *fname) WalSegSz = longhdr->xlp_seg_size; + // if we skip errors, we don't need to check the segment size if (!IsValidWalSegSize(WalSegSz)) { - pg_log_error(ngettext("invalid WAL segment size in WAL file \"%s\" (%d byte)", - "invalid WAL segment size in WAL file \"%s\" (%d bytes)", - WalSegSz), - fname, WalSegSz); - pg_log_error_detail("The WAL segment size must be a power of two between 1 MB and 1 GB."); - exit(1); + if (!ignore_format_errors) + { + pg_log_error(ngettext("invalid WAL segment size in WAL file \"%s\" (%d byte)", + "invalid WAL segment size in WAL file \"%s\" (%d bytes)", + WalSegSz), + fname, WalSegSz); + pg_log_error_detail("The WAL segment size must be a power of two between 1 MB and 1 GB."); + exit(1); + } + else + { + struct stat stat; + if(fstat(fd, &stat) != 0) + pg_fatal("could not stat file \"%s\"", fname); + + WalSegSz = stat.st_size; + + // if file size is invalid, the xlogreader will fail later with some obscure error + // so better to fail here + if (!IsValidWalSegSize(WalSegSz)) + { + pg_fatal(ngettext("invalid WAL segment size in WAL file \"%s\" (%d byte)", + "invalid WAL segment size in WAL file \"%s\" (%d bytes)", + WalSegSz), + fname, WalSegSz); + } + } } } else if (r < 0) @@ -289,18 +345,18 @@ search_directory(const char *directory, const char *fname) * The valid target directory is returned. 
*/ static char * -identify_target_directory(char *directory, char *fname) +identify_target_directory(char *directory, char *fname, bool ignore_format_errors) { char fpath[MAXPGPATH]; if (directory != NULL) { - if (search_directory(directory, fname)) + if (search_directory(directory, fname, ignore_format_errors)) return pg_strdup(directory); /* directory / XLOGDIR */ snprintf(fpath, MAXPGPATH, "%s/%s", directory, XLOGDIR); - if (search_directory(fpath, fname)) + if (search_directory(fpath, fname, ignore_format_errors)) return pg_strdup(fpath); } else @@ -308,10 +364,10 @@ identify_target_directory(char *directory, char *fname) const char *datadir; /* current directory */ - if (search_directory(".", fname)) + if (search_directory(".", fname, ignore_format_errors)) return pg_strdup("."); /* XLOGDIR */ - if (search_directory(XLOGDIR, fname)) + if (search_directory(XLOGDIR, fname, ignore_format_errors)) return pg_strdup(XLOGDIR); datadir = getenv("PGDATA"); @@ -319,7 +375,7 @@ identify_target_directory(char *directory, char *fname) if (datadir != NULL) { snprintf(fpath, MAXPGPATH, "%s/%s", datadir, XLOGDIR); - if (search_directory(fpath, fname)) + if (search_directory(fpath, fname, ignore_format_errors)) return pg_strdup(fpath); } } @@ -341,6 +397,18 @@ WALDumpOpenSegment(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID tli = *tli_p; char fname[MAXPGPATH]; int tries; + XLogDumpPrivate *private = state->private_data; + + if(private->input_filename) + { + Assert(nextSegNo == 0); + + state->seg.ws_file = open_file_in_directory(state->segcxt.ws_dir, private->input_filename); + if (state->seg.ws_file >= 0) + return; + + pg_fatal("could not open file \"%s\": %m", private->input_filename); + } XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); @@ -411,6 +479,7 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { WALOpenSegment *seg = &errinfo.wre_seg; char fname[MAXPGPATH]; + char *actual_fname = private->input_filename ? 
private->input_filename : fname; XLogFileName(fname, seg->ws_tli, seg->ws_segno, state->segcxt.ws_segsize); @@ -419,11 +488,11 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { errno = errinfo.wre_errno; pg_fatal("could not read from file \"%s\", offset %d: %m", - fname, errinfo.wre_off); + actual_fname, errinfo.wre_off); } else pg_fatal("could not read from file \"%s\", offset %d: read %d of %d", - fname, errinfo.wre_off, errinfo.wre_read, + actual_fname, errinfo.wre_off, errinfo.wre_read, errinfo.wre_req); } @@ -499,7 +568,7 @@ XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath) char forkname[FORKNAMECHARS + 2]; /* _ + terminating zero */ FILE *file; BlockNumber blk; - RelFileLocator rnode; + RelFileLocator rlocator; ForkNumber fork; if (!XLogRecHasBlockRef(record, block_id)) @@ -515,7 +584,7 @@ XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath) pg_fatal("%s", record->errormsg_buf); (void) XLogRecGetBlockTagExtended(record, block_id, - &rnode, &fork, &blk, NULL); + &rlocator, &fork, &blk, NULL); if (fork >= 0 && fork <= MAX_FORKNUM) sprintf(forkname, "_%s", forkNames[fork]); @@ -525,7 +594,7 @@ XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath) snprintf(filename, MAXPGPATH, "%s/%08X-%08X-%08X.%u.%u.%u.%u%s", savepath, record->seg.ws_tli, LSN_FORMAT_ARGS(record->ReadRecPtr), - rnode.spcOid, rnode.dbOid, rnode.relNumber, blk, forkname); + rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blk, forkname); file = fopen(filename, PG_BINARY_W); if (!file) @@ -551,16 +620,26 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) uint32 fpi_len; uint8 info = XLogRecGetInfo(record); XLogRecPtr xl_prev = XLogRecGetPrev(record); + XLogDumpPrivate *private = record->private_data; StringInfoData s; XLogRecGetLen(record, &rec_len, &fpi_len); - printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", - desc->rm_name, - rec_len, 
XLogRecGetTotalLen(record), - XLogRecGetXid(record), + if(private->input_filename) + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, offset: %X/%08X, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), LSN_FORMAT_ARGS(record->ReadRecPtr), - LSN_FORMAT_ARGS(xl_prev)); + LSN_FORMAT_ARGS(xl_prev)); + else + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + LSN_FORMAT_ARGS(record->ReadRecPtr), + LSN_FORMAT_ARGS(xl_prev)); + id = desc->rm_identify(info); if (id == NULL) @@ -766,7 +845,10 @@ usage(void) printf(_(" -f, --follow keep retrying after reaching end of WAL\n")); printf(_(" -F, --fork=FORK only show records that modify blocks in fork FORK;\n" " valid names are main, fsm, vm, init\n")); + printf(_(" -i, --ignore ignore format errors, skip invalid structures\n")); + printf(_(" -N, --file=FNAME dump log records from a single file\n")); printf(_(" -n, --limit=N number of records to display\n")); + printf(_(" -o, --offset=OFFSET offset of the first record to in a file to dump\n")); printf(_(" -p, --path=PATH directory in which to find WAL segment files or a\n" " directory with a ./pg_wal that contains such files\n" " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); @@ -801,6 +883,9 @@ main(int argc, char **argv) XLogRecPtr first_record; char *waldir = NULL; char *errormsg; + char *fName = NULL; + bool single_file = false; + size_t start_offset = OFFSET_INVALID; static struct option long_options[] = { {"bkp-details", no_argument, NULL, 'b'}, @@ -808,6 +893,9 @@ main(int argc, char **argv) {"end", required_argument, NULL, 'e'}, {"follow", no_argument, NULL, 'f'}, {"fork", required_argument, NULL, 'F'}, + {"file", required_argument, NULL, 'N'}, + {"ignore", no_argument, NULL, 'i'}, + {"offset", required_argument, NULL, 'o'}, {"fullpage", no_argument, NULL, 'w'}, {"help", no_argument, NULL, 
'?'}, {"limit", required_argument, NULL, 'n'}, @@ -857,6 +945,7 @@ main(int argc, char **argv) private.startptr = InvalidXLogRecPtr; private.endptr = InvalidXLogRecPtr; private.endptr_reached = false; + private.input_filename = NULL; config.quiet = false; config.bkp_details = false; @@ -875,6 +964,7 @@ main(int argc, char **argv) config.save_fullpage_path = NULL; config.stats = false; config.stats_per_record = false; + config.ignore_format_errors = false; stats.startptr = InvalidXLogRecPtr; stats.endptr = InvalidXLogRecPtr; @@ -885,7 +975,7 @@ main(int argc, char **argv) goto bad_argument; } - while ((option = getopt_long(argc, argv, "bB:e:fF:n:p:qr:R:s:t:wx:z", + while ((option = getopt_long(argc, argv, "bB:e:fF:in:N:o:p:qr:R:s:t:wx:z", long_options, &optindex)) != -1) { switch (option) @@ -924,6 +1014,13 @@ main(int argc, char **argv) } config.filter_by_extended = true; break; + case 'N': + fName = pg_strdup(optarg); + single_file = true; + break; + case 'i': + config.ignore_format_errors = true; + break; case 'n': if (sscanf(optarg, "%d", &config.stop_after_records) != 1) { @@ -931,6 +1028,13 @@ main(int argc, char **argv) goto bad_argument; } break; + case 'o': + if (sscanf(optarg, "%zu", &start_offset) != 1) + { + pg_log_error("could not parse offset \"%s\"", optarg); + goto bad_argument; + } + break; case 'p': waldir = pg_strdup(optarg); break; @@ -1096,6 +1200,73 @@ main(int argc, char **argv) goto bad_argument; } + if (start_offset != OFFSET_INVALID) + { + if(!XLogRecPtrIsInvalid(private.startptr) || !XLogRecPtrIsInvalid(private.endptr)) + { + pg_log_error("either file offset or start/end pointers should be specified"); + goto bad_argument; + } + + if(!single_file) + { + pg_log_error("offset option could only be used with filename option"); + goto bad_argument; + } + + /* Log records are maxaligned, start at the closest next position */ + private.startptr = MAXALIGN(start_offset); + } + + if(single_file) + { + char *directory = NULL; + int fd; + struct stat 
stat; + + if(config.follow) + { + pg_log_error("Follow could not be used in file dump mode"); + goto bad_argument; + } + + if (waldir != NULL) + { + pg_log_error("either single file or wal directory should be specified"); + goto bad_argument; + } + + split_path(fName, &directory, &private.input_filename); + waldir = directory; + + if(waldir == NULL) + { + char *cwd = malloc(MAXPGPATH); + + if (!getcwd(cwd, MAXPGPATH)) + pg_fatal("could identify current directory: %m"); + + waldir = cwd; + } + + if (!verify_directory(waldir)) + pg_fatal("could not open directory \"%s\": %m", waldir); + + fd = open_file_in_directory(waldir, private.input_filename); + if (fd < 0) + pg_fatal("could not open file \"%s\"", private.input_filename); + + if(fstat(fd, &stat) != 0) + pg_fatal("could not stat file \"%s\"", private.input_filename); + + private.endptr = stat.st_size; + + /* Round up segment size to next power of 2 or 1MB */ + WalSegSz = Max(next_pow2_int(private.endptr), 1024 * 1024); + + close(fd); + } + if (waldir != NULL) { /* validate path points to directory */ @@ -1117,6 +1288,12 @@ main(int argc, char **argv) int fd; XLogSegNo segno; + if(single_file) + { + pg_log_error("either single file or start/end boundaries should be specified"); + goto bad_argument; + } + split_path(argv[optind], &directory, &fname); if (waldir == NULL && directory != NULL) @@ -1127,7 +1304,7 @@ main(int argc, char **argv) pg_fatal("could not open directory \"%s\": %m", waldir); } - waldir = identify_target_directory(waldir, fname); + waldir = identify_target_directory(waldir, fname, config.ignore_format_errors); fd = open_file_in_directory(waldir, fname); if (fd < 0) pg_fatal("could not open file \"%s\"", fname); @@ -1189,10 +1366,11 @@ main(int argc, char **argv) } } else - waldir = identify_target_directory(waldir, NULL); + if (!single_file) + waldir = identify_target_directory(waldir, NULL, config.ignore_format_errors); /* we don't know what to print */ - if 
(XLogRecPtrIsInvalid(private.startptr)) + if (XLogRecPtrIsInvalid(private.startptr) && !single_file) { pg_log_error("no start WAL location given"); goto bad_argument; @@ -1210,12 +1388,34 @@ main(int argc, char **argv) if (!xlogreader_state) pg_fatal("out of memory while allocating a WAL reading processor"); - /* first find a valid recptr to start from */ - first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + if(single_file) + { + if(config.ignore_format_errors) + { + xlogreader_state->skip_page_validation = true; + xlogreader_state->skip_invalid_records = true; + } - if (first_record == InvalidXLogRecPtr) - pg_fatal("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(private.startptr)); + xlogreader_state->skip_lsn_checks = true; + first_record = private.startptr; + XLogBeginRead(xlogreader_state, first_record); + } + else + { + if(config.ignore_format_errors) + { + xlogreader_state->skip_page_validation = true; + xlogreader_state->skip_invalid_records = true; + xlogreader_state->skip_lsn_checks = true; + } + + /* first find a valid recptr to start from */ + first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + + if (first_record == InvalidXLogRecPtr) + pg_fatal("could not find a valid record after %X/%X", + LSN_FORMAT_ARGS(private.startptr)); + } /* * Display a message that we're skipping data if `from` wasn't a pointer diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 5a5d7b37ac1..57036c508c0 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -13,6 +13,7 @@ #include "access/xlogbackup.h" #include "access/xlogdefs.h" +#include "catalog/pg_control.h" #include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" @@ -40,6 +41,9 @@ extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; */ #define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber +extern bool NeonRecoveryRequested; +extern XLogRecPtr neonLastRec; +extern bool neonWriteOk; /* these variables 
are GUC parameters related to XLOG */ extern PGDLLIMPORT int wal_segment_size; @@ -269,6 +273,9 @@ extern XLogRecPtr GetLastImportantRecPtr(void); extern void SetRedoStartLsn(XLogRecPtr RedoStartLSN); extern XLogRecPtr GetRedoStartLsn(void); +extern void SetRedoStartLsn(XLogRecPtr RedoStartLSN); +extern XLogRecPtr GetRedoStartLsn(void); + extern void SetWalWriterSleeping(bool sleeping); extern Size WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count, @@ -328,6 +335,13 @@ extern void do_pg_abort_backup(int code, Datum arg); extern void register_persistent_abort_backup_handler(void); extern SessionBackupState get_backup_status(void); +/* + * NEON: Hook to allow the neon extension to restore running-xacts from CLOG + * at replica startup without having to wait for a running-xacts record. + */ +typedef bool (*restore_running_xacts_callback_t) (CheckPoint *checkpoint, TransactionId **xids, int *nxids); +extern restore_running_xacts_callback_t restore_running_xacts_callback; + /* File path names (all relative to $PGDATA) */ #define RECOVERY_SIGNAL_FILE "recovery.signal" #define STANDBY_SIGNAL_FILE "standby.signal" @@ -337,6 +351,8 @@ extern SessionBackupState get_backup_status(void); #define TABLESPACE_MAP "tablespace_map" #define TABLESPACE_MAP_OLD "tablespace_map.old" +#define NEON_SIGNAL_FILE "neon.signal" + /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 9738462d3c9..d10dd359f56 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -216,6 +216,10 @@ struct XLogReaderState /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */ XLogRecPtr overwrittenRecPtr; + /* Disable validation to allow dumping corrupt WAL */ + bool skip_page_validation; + bool skip_invalid_records; + bool skip_lsn_checks; /* ---------------------------------------- * Decoded representation of current record @@ -441,4 +445,7 @@ extern 
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, BlockNumber *blknum, Buffer *prefetch_buffer); +extern DecodedXLogRecord * +XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized); + #endif /* XLOGREADER_H */ diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 91446303024..6edef021f74 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -136,6 +136,7 @@ extern void ShutdownWalRecovery(void); extern void RemovePromoteSignalFiles(void); extern bool HotStandbyActive(void); +extern void XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); extern RecoveryPauseState GetRecoveryPauseState(void); extern void SetRecoveryPause(bool recoveryPause); @@ -155,4 +156,6 @@ extern void RecoveryRequiresIntParameter(const char *param_name, int currValue, extern void xlog_outdesc(StringInfo buf, XLogReaderState *record); +extern void XLogUpdateWalBuffers(char* data, XLogRecPtr start, size_t len); + #endif /* XLOGRECOVERY_H */ diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index a1870d8e5aa..5c9eb8e9bf0 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -84,6 +84,12 @@ typedef struct ReadLocalXLogPageNoWaitPrivate bool end_of_wal; /* true, when end of WAL is reached */ } ReadLocalXLogPageNoWaitPrivate; +/* + * Returns true if we shouldn't do REDO on that block in record indicated by + * block_id; false otherwise. 
+ */ +extern bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, Buffer *buf); extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 1bef98471c3..e788c7aecdc 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -108,6 +108,10 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount; /* in tcop/postgres.c */ extern void ProcessInterrupts(void); +/* Callback called by ProcessInterrupts in the loop while it is returning true. */ +typedef bool (*process_interrupts_callback_t)(void); +extern process_interrupts_callback_t ProcessInterruptsCallback; + /* Test whether an interrupt is pending */ #ifndef WIN32 #define INTERRUPTS_PENDING_CONDITION() \ @@ -539,4 +543,7 @@ extern void RestoreClientConnectionInfo(char *conninfo); /* in executor/nodeHash.c */ extern size_t get_hash_memory_limit(void); +/* in storage/buffer/buf_init.c */ +extern bool am_wal_redo_postgres; + #endif /* MISCADMIN_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 0dec7d93b3b..51b031921d2 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -326,6 +326,8 @@ extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray; extern PGDLLIMPORT WritebackContext BackendWritebackContext; +extern Buffer wal_redo_buffer; + /* in localbuf.c */ extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 41fdc1e7693..804adf77a82 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -175,6 +175,7 @@ extern PGDLLIMPORT int bgwriter_flush_after; extern PGDLLIMPORT const PgAioHandleCallbacks aio_shared_buffer_readv_cb; extern PGDLLIMPORT const 
PgAioHandleCallbacks aio_local_buffer_readv_cb; +extern PGDLLIMPORT bool neon_test_evict; /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; @@ -184,8 +185,8 @@ extern PGDLLIMPORT int NLocBuffer; extern PGDLLIMPORT Block *LocalBufferBlockPointers; extern PGDLLIMPORT int32 *LocalRefCount; -/* upper limit for effective_io_concurrency */ -#define MAX_IO_CONCURRENCY 1000 +/* upper limit for effective_io_concurrency (power of 2 for convenience) */ +#define MAX_IO_CONCURRENCY 1024 /* special block number for ReadBuffer() */ #define P_NEW InvalidBlockNumber /* grow the file to get a new page */ From 39a71d59d4d6cc645502829f18d250fdd307a95f Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 31 Jul 2024 17:33:57 +0200 Subject: [PATCH 10/50] NEON: Add prefetching to various scan types Specifically, hash vacuum & btree index (only) scans Co-authored-by: Konstantin Knizhnik --- src/backend/access/hash/hash.c | 16 ++ src/backend/access/nbtree/README | 44 +++++ src/backend/access/nbtree/nbtinsert.c | 2 +- src/backend/access/nbtree/nbtree.c | 2 + src/backend/access/nbtree/nbtsearch.c | 234 ++++++++++++++++++++++++- src/backend/optimizer/path/costsize.c | 2 + src/backend/storage/buffer/bufmgr.c | 1 + src/backend/utils/misc/guc_tables.c | 20 +++ src/include/access/nbtree.h | 19 +- src/include/optimizer/cost.h | 3 + src/test/regress/expected/sysviews.out | 4 +- 11 files changed, 342 insertions(+), 5 deletions(-) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 53061c819fb..d9f980aaa5b 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -33,6 +33,7 @@ #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" +#include "utils/spccache.h" /* Working state for hashbuild and its callback */ typedef struct @@ -472,13 +473,17 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; + Bucket 
prf_bucket; Buffer metabuf = InvalidBuffer; HashMetaPage metap; HashMetaPage cachedmetap; + int io_concurrency; tuples_removed = 0; num_index_tuples = 0; + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* * We need a copy of the metapage so that we can use its hashm_spares[] * values to compute bucket page addresses, but a cached copy should be @@ -493,9 +498,14 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Scan the buckets that we know exist */ cur_bucket = 0; + prf_bucket = cur_bucket; cur_maxbucket = orig_maxbucket; loop_top: + for (; prf_bucket <= cur_maxbucket && + prf_bucket < cur_bucket + io_concurrency; prf_bucket++) + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; @@ -506,6 +516,12 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Page page; bool split_cleanup = false; + if (io_concurrency > 0 && prf_bucket <= cur_maxbucket) + { + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + prf_bucket++; + } + /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 53d4a61dc3f..90119b14bf4 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1081,3 +1081,47 @@ item is irrelevant, and need not be stored at all. This arrangement corresponds to the fact that an L&Y non-leaf page has one more pointer than key. Suffix truncation's negative infinity attributes behave in the same way. + +Notes About Index Scan Prefetch +------------------------------- + +Prefetch can significantly improve the speed of OLAP queries. +To be able to perform prefetch, we need to know which pages will +be accessed during the scan. 
It is trivial for heap- and bitmap scans,
+but requires more effort for index scans: to implement prefetch for
+index scans, we need to find out subsequent leaf pages.
+
+Postgres links all pages at the same level of the B-Tree in a doubly linked list and uses this list for
+forward and backward iteration. This list, however, can not trivially be used for prefetching because to locate the next page we first need to load the current page. To prefetch more than only the next page, we can utilize the parent page's downlinks instead, as it contains references to most of the target page's sibling pages.
+
+Because Postgres' nbtree pages have no reference to their parent page, we need to remember the parent page when descending the btree and use it to prefetch subsequent pages. We will utilize the parent's linked list to improve the performance of this prefetch system past the key range of the parent page.
+
+We should prefetch not only leaf pages, but also the next parent page.
+The trick is to correctly calculate the moment when it will be needed:
+We should not issue the prefetch request when prefetch requests for all children from the current parent page have already been issued, but when there are only effective_io_concurrency line pointers left to prefetch from the page.
+
+Currently there are two different prefetch implementations for
+index-only scan and index scan. Index-only scan doesn't need to access heap tuples so it prefetches
+only B-Tree leaf pages (and their parents). Prefetch for index-only scans is performed only
+if a parallel plan is not used. Parallel index scan uses a critical section for obtaining the next
+page by a parallel worker. The leaf page is loaded in this critical section.
+And if most of the time is spent loading the page, then it actually eliminates any concurrency
+and makes prefetch useless. For relatively small tables Postgres will not choose a parallel plan in
And for large tables it can be enforced by setting max_parallel_workers_per_gather=0. + +Prefetch for normal (not index-only) index tries to prefetch heap tuples +referenced from leaf page. Average number of items per page +is about 100 which is comparable with default value of effective_io_concurrency. +So there is not so much sense trying to prefetch also next leaf page. + +As far as it is difficult to estimate number of entries traversed by index scan, +we prefer not to prefetch large number of pages from the very beginning. +Such useless prefetch can reduce the performance of point lookups. +Instead of it we start with smallest prefetch distance and increase it +by INCREASE_PREFETCH_DISTANCE_STEP after processing each item +until it reaches effective_io_concurrency. In case of index-only +scan we increase prefetch distance after processing each leaf pages +and for index scan - after processing each tuple. +The only exception is case when no key bounds are specified. +In this case we traverse the whole relation and it makes sense +to start with the largest possible prefetch distance from the very beginning. 
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index aa82cede30a..c2a7907ca47 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -2165,7 +2165,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index fdff960c130..1bc1263dd9c 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -35,6 +35,7 @@ #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* @@ -363,6 +364,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + so->prefetch_maximum = 0; /* disable prefetch */ /* * We don't know yet whether the scan will be index-only, so we do not diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 4af1ff1e9e5..4ffdfdd9592 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -18,11 +18,14 @@ #include "access/nbtree.h" #include "access/relscan.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "storage/predicate.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/spccache.h" static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so); @@ -53,6 +56,7 @@ static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno); static bool _bt_endpoint(IndexScanDesc scan, 
ScanDirection dir); +#define INCREASE_PREFETCH_DISTANCE_STEP 1 /* * _bt_drop_lock_and_maybe_pin() @@ -863,6 +867,70 @@ _bt_compare(Relation rel, return 0; } + +/* + * _bt_read_parent_for_prefetch - read parent page and extract references to children for prefetch. + */ +static void +_bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + OffsetNumber n_child; + int next_parent_prefetch_index; + int i, j; + + buf = _bt_getbuf(rel, parent, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + offnum = P_FIRSTDATAKEY(opaque); + n_child = PageGetMaxOffsetNumber(page) - offnum + 1; + + /* Position where we should insert prefetch of parent page: we intentionally use prefetch_maximum here instead of current_prefetch_distance, + * assuming that it will reach prefetch_maximum before we reach and of the parent page + */ + next_parent_prefetch_index = (n_child > so->prefetch_maximum) + ? 
n_child - so->prefetch_maximum : 0; + + if (ScanDirectionIsForward(dir)) + { + so->next_parent = opaque->btpo_next; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + i); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + else + { + so->next_parent = opaque->btpo_prev; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + n_child - i - 1); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + so->n_prefetch_blocks = j; + so->last_prefetch_index = 0; + _bt_relbuf(rel, buf); +} + /* * _bt_first() -- Find the first item in a scan. * @@ -1241,6 +1309,37 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } + /* Neon: initialize prefetch */ + so->n_prefetch_requests = 0; + so->n_prefetch_blocks = 0; + so->last_prefetch_index = 0; + so->next_parent = P_NONE; + so->prefetch_maximum = IsCatalogRelation(rel) + ? effective_io_concurrency + : get_tablespace_io_concurrency(rel->rd_rel->reltablespace); + + if (scan->xs_want_itup) /* index only scan */ + { + if (enable_indexonlyscan_prefetch) + { + /* We disable prefetch for parallel index-only scan. + * Neon prefetch is efficient only if prefetched blocks are accessed by the same worker + * which issued prefetch request. The logic of splitting pages between parallel workers in + * index scan doesn't allow to satisfy this requirement. 
+ * Also prefetch of leave pages will be useless if expected number of rows fits in one page. + */ + if (scan->parallel_scan) + so->prefetch_maximum = 0; /* disable prefetch */ + } + else + so->prefetch_maximum = 0; /* disable prefetch */ + } + else if (!enable_indexscan_prefetch || !scan->heapRelation) + so->prefetch_maximum = 0; /* disable prefetch */ + + /* Start with minimal prefetch distance to minimize prefetch overhead for exact or limited index scans */ + so->current_prefetch_distance = 0; + /* * If we found no usable boundary keys, we have to start from one end of * the tree. Walk down that edge to the first or last key, and scan from @@ -1517,6 +1616,35 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Assert(ScanDirectionIsBackward(dir) == inskey.backward); stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ); + /* Start prefetching for index only scan */ + if (so->prefetch_maximum > 0 && stack != NULL && scan->xs_want_itup) /* index only scan */ + { + BlockNumber leaf = BufferGetBlockNumber(so->currPos.buf); + + _bt_read_parent_for_prefetch(scan, stack->bts_blkno, dir); + + /* + * We can not use stack->bts_offset because once parent page is unlocked, it can be updated and state captured by + * by _bt_read_parent_for_prefetch may not match with _bt_search + */ + + for (int j = 0; j < so->n_prefetch_blocks; j++) + { + if (so->prefetch_blocks[j] == leaf) + { + so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; + so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - j); + so->last_prefetch_index = j + so->n_prefetch_requests; + + do { + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[j]); + } while (++j < so->last_prefetch_index); + + break; + } + } + } + /* don't need to keep the stack around... 
*/ _bt_freestack(stack); @@ -1620,6 +1748,62 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) } _bt_returnitem(scan, so); + + /* Neon: Prefetch pointed-to heap pages */ + if (!scan->xs_want_itup && so->prefetch_maximum > 0) + { + int prefetchLimit, prefetchDistance; + + /* Neon: prefetch referenced heap pages. + * As far as it is difficult to predict how much items index scan will return + * we do not want to prefetch many heap pages from the very beginning because + * them may not be needed. So we are going to increase prefetch distance by INCREASE_PREFETCH_DISTANCE_STEP + * at each index scan iteration until it reaches prefetch_maximum. + */ + + /* Advance pefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + else + so->current_prefetch_distance = so->prefetch_maximum; + + /* How much we can prefetch */ + prefetchLimit = Min(so->current_prefetch_distance, so->currPos.lastItem - so->currPos.firstItem + 1); + + /* Active prefeth requests */ + prefetchDistance = so->n_prefetch_requests; + + /* + * Consume one prefetch request (if any) + */ + if (prefetchDistance != 0) + prefetchDistance -= 1; + + /* Keep number of active prefetch requests equal to the current prefetch distance. 
+ * When prefetch distance reaches prefetch maximum, this loop performs at most one iteration, + * but at the beginning of index scan it performs up to INCREASE_PREFETCH_DISTANCE_STEP+1 iterations + */ + if (ScanDirectionIsForward(dir)) + { + while (prefetchDistance < prefetchLimit && so->currPos.itemIndex + prefetchDistance <= so->currPos.lastItem) + { + BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex + prefetchDistance].heapTid.ip_blkid); + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); + prefetchDistance += 1; + } + } + else + { + while (prefetchDistance < prefetchLimit && so->currPos.itemIndex - prefetchDistance >= so->currPos.firstItem) + { + BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex - prefetchDistance].heapTid.ip_blkid); + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); + prefetchDistance += 1; + } + } + so->n_prefetch_requests = prefetchDistance; /* update number of active prefetch requests */ + } + return true; } @@ -2152,6 +2336,8 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) scan->xs_heaptid = currItem->heapTid; if (so->currTuples) scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + } /* @@ -2224,6 +2410,30 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) BTScanPosUnpinIfPinned(so->currPos); + if (scan->xs_want_itup && so->prefetch_maximum > 0) /* Prefetching of leaf pages for index-only scan */ + { + /* Advance pefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + + so->n_prefetch_requests -= 1; /* we load next leaf page, so decrement number of active prefetch requests */ + + /* Check if the are more children to prefetch at current parent page */ + if (so->last_prefetch_index == so->n_prefetch_blocks && so->next_parent != P_NONE) + { + /* we have prefetched all 
items from current parent page, let's move to the next parent page */ + _bt_read_parent_for_prefetch(scan, so->next_parent, dir); + so->n_prefetch_requests -= 1; /* loading parent page consumes one more prefetch request */ + } + + /* Try to keep number of active prefetch requests equal to current prefetch distance */ + while (so->n_prefetch_requests < so->current_prefetch_distance && so->last_prefetch_index < so->n_prefetch_blocks) + { + so->n_prefetch_requests += 1; + PrefetchBuffer(scan->indexRelation, MAIN_FORKNUM, so->prefetch_blocks[so->last_prefetch_index++]); + } + } + /* Walk to the next page with data */ if (ScanDirectionIsForward(dir)) blkno = so->currPos.nextPage; @@ -2612,7 +2822,8 @@ _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, * The returned buffer is pinned and read-locked. */ Buffer -_bt_get_endpoint(Relation rel, uint32 level, bool rightmost) +_bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + BlockNumber* parent) { Buffer buf; Page page; @@ -2620,6 +2831,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) OffsetNumber offnum; BlockNumber blkno; IndexTuple itup; + BlockNumber parent_blocknum = P_NONE; /* * If we are looking for a leaf page, okay to descend from fast root; @@ -2636,6 +2848,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); + blkno = BufferGetBlockNumber(buf); for (;;) { @@ -2673,6 +2886,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) offnum = P_FIRSTDATAKEY(opaque); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + parent_blocknum = blkno; blkno = BTreeTupleGetDownLink(itup); buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); @@ -2680,6 +2894,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) opaque = BTPageGetOpaque(page); } + if (parent) + *parent = parent_blocknum; + return buf; } @@ -2702,6 +2919,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) Page page; 
BTPageOpaque opaque; OffsetNumber start; + BlockNumber parent; Assert(!BTScanPosIsValid(so->currPos)); Assert(!so->needPrimScan); @@ -2710,7 +2928,8 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) * Scan down to the leftmost or rightmost leaf page. This is a simplified * version of _bt_search(). */ - so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); + so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), + &parent); if (!BufferIsValid(so->currPos.buf)) { @@ -2723,6 +2942,17 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } + /* Start prefetching for index-only scan */ + if (so->prefetch_maximum > 0 && parent != P_NONE && scan->xs_want_itup) /* index only scan */ + { + _bt_read_parent_for_prefetch(scan, parent, dir); + so->n_prefetch_requests = so->last_prefetch_index = + Min(so->prefetch_maximum, so->n_prefetch_blocks); + + for (int i = 0; i < so->last_prefetch_index; i++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + } + page = BufferGetPage(so->currPos.buf); opaque = BTPageGetOpaque(page); Assert(P_ISLEAF(opaque)); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 3d44815ed5a..11f67467d45 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -163,6 +163,8 @@ bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_presorted_aggregate = true; bool enable_async_append = true; +bool enable_indexscan_prefetch = true; +bool enable_indexonlyscan_prefetch = true; typedef struct { diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 384677bf5ef..9fef9979232 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -61,6 +61,7 @@ #include "storage/read_stream.h" #include "storage/smgr.h" #include "storage/standby.h" +#include "replication/walsender.h" #include "utils/memdebug.h" #include 
"utils/ps_status.h"
 #include "utils/rel.h"
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 07283180592..dd11cd1c223 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -790,6 +790,26 @@ StaticAssertDecl(lengthof(config_type_names) == (PGC_ENUM + 1),
 struct config_bool ConfigureNamesBool[] =
 {
+	{
+		{"enable_indexscan_prefetch", PGC_USERSET, RESOURCES_IO,
+			gettext_noop("Enables prefetching of heap pages in index scans."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_indexscan_prefetch,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"enable_indexonlyscan_prefetch", PGC_USERSET, RESOURCES_IO,
+			gettext_noop("Enables prefetching of leaf pages in index-only scans."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_indexonlyscan_prefetch,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of sequential-scan plans."),
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e709d2e0afe..9b1c242cb81 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1092,6 +1092,22 @@ typedef struct BTScanOpaqueData
 	/* keep these last in struct for efficiency */
 	BTScanPosData currPos;		/* current position data */
 	BTScanPosData markPos;		/* marked position, if any */
+
+	/* Neon: prefetch state */
+	int			prefetch_maximum;	/* maximal number of prefetch requests */
+
+	/* Prefetch of referenced heap pages for index scan */
+	/* To minimize wasted prefetch requests we start with prefetch distance 0
+	 * and increase it until it reaches prefetch_maximum
+	 */
+	int			current_prefetch_distance;
+
+	/* Prefetch of leaf pages of B-tree for index-only scan */
+	int			n_prefetch_requests;	/* number of active prefetch requests */
+	int			n_prefetch_blocks;	/* number of elements in prefetch_blocks */
+	int			last_prefetch_index;	/* current position in prefetch_blocks (prefetch_blocks[0..last_prefetch_index] are already requested) */
+ BlockNumber next_parent; /* pointer to next parent page */ + BlockNumber prefetch_blocks[MaxTIDsPerBTreePage + 1]; /* leaves + parent page */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -1307,7 +1323,8 @@ extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); -extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); +extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + BlockNumber* parent); /* * prototypes for functions in nbtutils.c diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index d397fe27dc1..d743ca2c30c 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -70,6 +70,9 @@ extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_presorted_aggregate; extern PGDLLIMPORT bool enable_async_append; +extern PGDLLIMPORT bool enable_indexscan_prefetch; +extern PGDLLIMPORT bool enable_indexonlyscan_prefetch; + extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 83228cfca29..91ce619defa 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -157,7 +157,9 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashjoin | on enable_incremental_sort | on enable_indexonlyscan | on + enable_indexonlyscan_prefetch | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -172,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%'; 
enable_seqscan | on enable_sort | on enable_tidscan | on -(24 rows) +(26 rows) -- There are always wait event descriptions for various types. InjectionPoint -- may be present or absent, depending on history since last postmaster start. From e7bd8665ea1abff1fe744d710198fe08e0c278e6 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 31 Jul 2024 18:20:35 +0200 Subject: [PATCH 11/50] NEON: Extend EXPLAIN with options for LFC and Prefetch IO metrics --- src/backend/commands/explain.c | 62 +++++++++++++++++++++++++++- src/backend/commands/explain_state.c | 16 +++++++ src/backend/executor/instrument.c | 16 ++++++- src/include/commands/explain_state.h | 2 + src/include/executor/instrument.h | 17 ++++++++ 5 files changed, 110 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 7e2792ead71..69101f9835e 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -144,6 +144,8 @@ static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info); +static void show_file_cache_info(ExplainState *es, const FileCacheInfo* file_cache_info); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -2287,6 +2289,14 @@ ExplainNode(PlanState *planstate, List *ancestors, if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->walusage); + /* Show prefetch usage */ + if (es->prefetch && planstate->instrument) + show_prefetch_info(es, &planstate->instrument->bufusage.prefetch); + + /* Show file cache usage */ + if (es->file_cache && planstate->instrument) + 
show_file_cache_info(es, &planstate->instrument->bufusage.file_cache); + /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { @@ -4080,7 +4090,57 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) } /* - * Show buffer usage details. This better be sync with peek_buffer_usage. + * Show prefetch statistics + */ +static void +show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info) +{ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Prefetch: hits=%lld misses=%lld expired=%lld duplicates=%lld\n", + (long long) prefetch_info->hits, + (long long) prefetch_info->misses, + (long long) prefetch_info->expired, + (long long) prefetch_info->duplicates); + } + else + { + ExplainPropertyInteger("Prefetch Hits", NULL, + prefetch_info->hits, es); + ExplainPropertyInteger("Prefetch Misses", NULL, + prefetch_info->misses, es); + ExplainPropertyInteger("Prefetch Expired Requests", NULL, + prefetch_info->expired, es); + ExplainPropertyInteger("Prefetch Duplicated Requests", NULL, + prefetch_info->duplicates, es); + } +} + +/* + * Show local file cache statistics + */ +static void +show_file_cache_info(ExplainState *es, const FileCacheInfo* file_cache_info) +{ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "File cache: hits=%lld misses=%lld\n", + (long long) file_cache_info->hits, + (long long) file_cache_info->misses); + } + else + { + ExplainPropertyInteger("File Cache Hits", NULL, + file_cache_info->hits, es); + ExplainPropertyInteger("File Cache Misses", NULL, + file_cache_info->misses, es); + } +} + +/* + * Show buffer usage details. This better be synced with peek_buffer_usage. 
*/
 static void
 show_buffer_usage(ExplainState *es, const BufferUsage *usage)
diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c
index 60d98d63a62..fcb6c8468d4 100644
--- a/src/backend/commands/explain_state.c
+++ b/src/backend/commands/explain_state.c
@@ -97,6 +97,10 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
 			buffers_set = true;
 			es->buffers = defGetBoolean(opt);
 		}
+		else if (strcmp(opt->defname, "prefetch") == 0)
+			es->prefetch = defGetBoolean(opt);
+		else if (strcmp(opt->defname, "filecache") == 0)
+			es->file_cache = defGetBoolean(opt);
 		else if (strcmp(opt->defname, "wal") == 0)
 			es->wal = defGetBoolean(opt);
 		else if (strcmp(opt->defname, "settings") == 0)
@@ -179,6 +183,18 @@
 	/* if the buffers was not set explicitly, set default value */
 	es->buffers = (buffers_set) ? es->buffers : es->analyze;

+	/* check that prefetch is used with EXPLAIN (BUFFERS) */
+	if (es->prefetch && !es->buffers)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("EXPLAIN option %s requires BUFFERS", "PREFETCH")));
+
+	/* check that filecache is used with EXPLAIN (BUFFERS) */
+	if (es->file_cache && !es->buffers)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("EXPLAIN option %s requires BUFFERS", "FILECACHE")));
+
 	/* check that timing is used with EXPLAIN ANALYZE */
 	if (es->timing && !es->analyze)
 		ereport(ERROR,
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 56e635f4700..87d33dc6fa3 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -239,8 +239,15 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
 	INSTR_TIME_ADD(dst->shared_blk_write_time, add->shared_blk_write_time);
 	INSTR_TIME_ADD(dst->local_blk_read_time, add->local_blk_read_time);
 	INSTR_TIME_ADD(dst->local_blk_write_time, add->local_blk_write_time);
 	INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time);
 	INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
+	/* Neon-specific stats */
+	dst->prefetch.hits += add->prefetch.hits;
+	dst->prefetch.misses += add->prefetch.misses;
+	dst->prefetch.expired += add->prefetch.expired;
+	dst->prefetch.duplicates += add->prefetch.duplicates;
+	dst->file_cache.hits += add->file_cache.hits;
+	dst->file_cache.misses += add->file_cache.misses;
 }

 /* dst += add - sub */
@@ -271,6 +276,13 @@ BufferUsageAccumDiff(BufferUsage *dst,
 						  add->temp_blk_read_time, sub->temp_blk_read_time);
 	INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
 						  add->temp_blk_write_time, sub->temp_blk_write_time);
+	/* Neon-only stats */
+	dst->prefetch.hits += add->prefetch.hits - sub->prefetch.hits;
+	dst->prefetch.misses += add->prefetch.misses - sub->prefetch.misses;
+	dst->prefetch.expired += add->prefetch.expired - sub->prefetch.expired;
+	dst->prefetch.duplicates += add->prefetch.duplicates - sub->prefetch.duplicates;
+	dst->file_cache.hits += add->file_cache.hits - sub->file_cache.hits;
+	dst->file_cache.misses += add->file_cache.misses - sub->file_cache.misses;
 }

 /* helper functions for WAL usage accumulation */
diff --git a/src/include/commands/explain_state.h b/src/include/commands/explain_state.h
index 32728f5d1a1..c5a7e00e424 100644
--- a/src/include/commands/explain_state.h
+++ b/src/include/commands/explain_state.h
@@ -49,6 +49,8 @@ typedef struct ExplainState
 	bool		analyze;		/* print actual times */
 	bool		costs;			/* print estimated costs */
 	bool		buffers;		/* print buffer usage */
+	bool		prefetch;		/* print buffer prefetch stats */
+	bool		file_cache;		/* print LFC stats */
 	bool		wal;			/* print WAL usage */
 	bool		timing;			/* print detailed node timing */
 	bool		summary;		/* print total planning and execution timing */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 03653ab6c6c..4c0d6754e09 100644
--- a/src/include/executor/instrument.h
+++
b/src/include/executor/instrument.h
@@ -15,6 +15,21 @@

 #include "portability/instr_time.h"

+/* Prefetch statistics */
+typedef struct
+{
+	int64		hits;
+	int64		misses;
+	int64		expired;
+	int64		duplicates;
+} PrefetchInfo;
+
+/* Local file cache statistics */
+typedef struct
+{
+	int64		hits;
+	int64		misses;
+} FileCacheInfo;
 /*
  * BufferUsage and WalUsage counters keep being incremented infinitely,
@@ -39,6 +54,8 @@ typedef struct BufferUsage
 	instr_time	local_blk_write_time;	/* time spent writing local blocks */
 	instr_time	temp_blk_read_time; /* time spent reading temp blocks */
 	instr_time	temp_blk_write_time;	/* time spent writing temp blocks */
+	PrefetchInfo prefetch;		/* prefetch statistics */
+	FileCacheInfo file_cache;	/* local file cache statistics */
 } BufferUsage;

 /*

From 0c9b671bb120827cf138ca054e2da4f20acd00ca Mon Sep 17 00:00:00 2001
From: Matthias van de Meent
Date: Wed, 31 Jul 2024 18:23:13 +0200
Subject: [PATCH 12/50] NEON: Adjust visibilitymap_set to always log hint bits

---
 src/backend/access/heap/visibilitymap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 745a04ef26e..57bd3a8fe29 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -301,7 +301,8 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 		 * WAL record inserted above, so it would be incorrect to
 		 * update the heap page's LSN.
*/ - if (XLogHintBitIsNeeded()) + /* NEON: we have to update page LSN even if wal_log_hints=off */ + if (true || XLogHintBitIsNeeded()) { Page heapPage = BufferGetPage(heapBuf); From 6000327ab85cb6522893a1fe54226781d1ce4831 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 3 Jul 2025 10:01:10 +0200 Subject: [PATCH 13/50] NEON: Allow privileged roles to use some administrative commands In neon.com, the privileged role will be neon_superuser, but other platforms may need a different role name, hence the parameter. --- src/backend/commands/publicationcmds.c | 4 ++-- src/backend/utils/adt/acl.c | 26 ++++++++++++++++++++++++++ src/include/miscadmin.h | 4 ++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/publicationcmds.c b/src/backend/commands/publicationcmds.c index 803c26ab216..6dda800a89d 100644 --- a/src/backend/commands/publicationcmds.c +++ b/src/backend/commands/publicationcmds.c @@ -849,7 +849,7 @@ CreatePublication(ParseState *pstate, CreatePublicationStmt *stmt) get_database_name(MyDatabaseId)); /* FOR ALL TABLES requires superuser */ - if (stmt->for_all_tables && !superuser()) + if (stmt->for_all_tables && !superuser() && !is_privileged_role()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to create FOR ALL TABLES publication"))); @@ -924,7 +924,7 @@ CreatePublication(ParseState *pstate, CreatePublicationStmt *stmt) &schemaidlist); /* FOR TABLES IN SCHEMA requires superuser */ - if (schemaidlist != NIL && !superuser()) + if (schemaidlist != NIL && !superuser() && !is_privileged_role()) ereport(ERROR, errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to create FOR TABLES IN SCHEMA publication")); diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c index 1213f9106d5..fc6f953d51a 100644 --- a/src/backend/utils/adt/acl.c +++ b/src/backend/utils/adt/acl.c @@ -133,6 +133,32 @@ static AclResult pg_role_aclcheck(Oid role_oid, Oid roleid, 
AclMode mode); static void RoleMembershipCacheCallback(Datum arg, int cacheid, uint32 hashvalue); +/* + * Name of the user-accessible privileged role in this system. + * Generally neon_superuser on neon.com + */ +char *privileged_role_name = NULL; + +/* Is the current user the designated privileged role */ +bool +is_privileged_role(void) +{ + return is_privileged_role_arg(GetUserId()); +} + +/* Does this user (by OID) inherit the privileged role? */ +bool +is_privileged_role_arg(Oid roleid) +{ + Oid privileged_role_oid; + + if (privileged_role_name == NULL) + return false; + + privileged_role_oid = get_role_oid(privileged_role_name, true /* missing_ok */); + + return privileged_role_oid != InvalidOid && has_privs_of_role(roleid, privileged_role_oid); +} /* * Test whether an identifier char can be left unquoted in ACLs. diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index e788c7aecdc..53db7708a6a 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -443,6 +443,10 @@ extern const char *GetSystemUser(void); extern bool superuser(void); /* current user is superuser */ extern bool superuser_arg(Oid roleid); /* given user is superuser */ +/* in utils/adt/acl.c */ +extern PGDLLIMPORT char *privileged_role_name; +extern bool is_privileged_role(void); /* current user is a privileged role */ +extern bool is_privileged_role_arg(Oid roleid); /* given user is a privileged role */ /***************************************************************************** * pmod.h -- * From 9c5a6b0fcc9a7fd98e2a05cac2da0a96cdbd55e5 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 3 Jul 2025 10:02:03 +0200 Subject: [PATCH 14/50] NEON: Add efficient 'new page' support in XLogRecordAssemble --- src/backend/access/transam/xloginsert.c | 37 +++++++++++++++++-------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 679c70f13ae..36a1e3d7e7e 100644 --- 
a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -669,22 +669,30 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, */ if (regbuf->flags & REGBUF_STANDARD) { - /* Assume we can omit data between pd_lower and pd_upper */ - uint16 lower = ((PageHeader) page)->pd_lower; - uint16 upper = ((PageHeader) page)->pd_upper; - - if (lower >= SizeOfPageHeaderData && - upper > lower && - upper <= BLCKSZ) + if (PageIsNew(page)) { - bimg.hole_offset = lower; - cbimg.hole_length = upper - lower; + bimg.hole_offset = 0; + cbimg.hole_length = BLCKSZ; } else { - /* No "hole" to remove */ - bimg.hole_offset = 0; - cbimg.hole_length = 0; + /* Assume we can omit data between pd_lower and pd_upper */ + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) + { + bimg.hole_offset = lower; + cbimg.hole_length = upper - lower; + } + else + { + /* No "hole" to remove */ + bimg.hole_offset = 0; + cbimg.hole_length = 0; + } } } else @@ -778,6 +786,11 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, rdt_datas_last->data = page; rdt_datas_last->len = BLCKSZ; } + else if (bimg.length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = 0; + } else { /* must skip the hole */ From 78c4718d9bdfebd2582115b350197204c5bc0d3b Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 3 Jul 2025 11:56:40 +0200 Subject: [PATCH 15/50] NEON: WAL-log various files' state changes Used for Logical Replication and various other system state changes. Required to keep state synchronized with Pageserver's Basebackup output, as we don't have systems in place to make direct writes at an LSN possible. TODO: Enforce using a subscription's FAILOVER option instead for LR? 
Co-authored-by: Konstantin Knizhnik --- src/backend/access/heap/rewriteheap.c | 45 ++++++++++++++++++++- src/backend/replication/logical/logical.c | 4 ++ src/backend/replication/logical/origin.c | 1 + src/backend/replication/logical/snapbuild.c | 17 ++++++++ src/backend/replication/slot.c | 19 +++++++++ 5 files changed, 85 insertions(+), 1 deletion(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index e6d2b5fced1..be36907e852 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -115,6 +115,8 @@ #include "lib/ilist.h" #include "miscadmin.h" #include "pgstat.h" +#include "replication/logical.h" +#include "replication/message.h" #include "replication/slot.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" @@ -750,6 +752,41 @@ raw_heap_insert(RewriteState state, HeapTuple tup) * ------------------------------------------------------------------------ */ +/* + * NEON: we need to persist mapping file in WAL + */ +static void +wallog_mapping_file(char const* path, int fd) +{ + char prefix[MAXPGPATH]; + + /* Do not wallog AUX file at replica */ + if (!XLogInsertAllowed()) + return; + + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + if (fd < 0) + { + elog(DEBUG1, "neon: deleting contents of rewrite file %s", path); + /* unlink file */ + LogLogicalMessage(prefix, NULL, 0, false, false); + } + else + { + off_t size = lseek(fd, 0, SEEK_END); + char* buf; + elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, (long)size); + if (size < 0) + elog(ERROR, "Failed to get size of mapping file: %m"); + buf = palloc((size_t)size); + lseek(fd, 0, SEEK_SET); + if (read(fd, buf, (size_t)size) != size) + elog(ERROR, "Failed to read mapping file: %m"); + LogLogicalMessage(prefix, buf, (size_t)size, false, false); + pfree(buf); + } +} + /* * Do preparations for logging logical mappings during a rewrite if * necessary. 
If we detect that we don't need to log anything we'll prevent @@ -885,6 +922,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state) errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, written, len))); src->off += len; + wallog_mapping_file(src->path, FileGetRawDesc(src->vfd)); XLogBeginInsert(); XLogRegisterData(&xlrec, sizeof(xlrec)); @@ -970,7 +1008,7 @@ logical_rewrite_log_mapping(RewriteState state, TransactionId xid, src->off = 0; memcpy(src->path, path, sizeof(path)); src->vfd = PathNameOpenFile(path, - O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + O_CREAT | O_EXCL | O_RDWR | PG_BINARY); if (src->vfd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -1135,6 +1173,8 @@ heap_xlog_logical_rewrite(XLogReaderState *r) errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); + wallog_mapping_file(path, fd); + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), @@ -1212,6 +1252,7 @@ CheckPointLogicalRewriteHeap(void) ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); + wallog_mapping_file(path, -1); } else { @@ -1240,6 +1281,8 @@ CheckPointLogicalRewriteHeap(void) errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); + wallog_mapping_file(path, fd); + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index f1eb798f3e9..de7c85499bc 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -1239,6 +1239,8 @@ message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, if (ctx->callbacks.message_cb == NULL) return; + if (strncmp(prefix, "neon-file", strlen("neon-file")) == 0) + return; /* Push callback + info on the error context stack */ state.ctx = ctx; @@ -1554,6 +1556,8 @@ stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, /* this callback 
is optional */ if (ctx->callbacks.stream_message_cb == NULL) return; + if (strncmp(prefix, "neon-file", strlen("neon-file")) == 0) + return; /* Push callback + info on the error context stack */ state.ctx = ctx; diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index a17bacf88e7..d50ce5c901b 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -82,6 +82,7 @@ #include "miscadmin.h" #include "nodes/execnodes.h" #include "pgstat.h" +#include "replication/logical.h" #include "replication/origin.h" #include "replication/slot.h" #include "storage/condition_variable.h" diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index adf18c397db..73e1891f8da 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -132,6 +132,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/reorderbuffer.h" #include "replication/snapbuild.h" #include "replication/snapbuild_internal.h" @@ -1650,6 +1651,14 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", tmppath))); + /* Do not wallog AUX file at replica */ + if (XLogInsertAllowed()) + { + char prefix[MAXPGPATH + sizeof("neon-file:")]; + /* NEON specific: persist snapshot in storage using logical message */ + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, (char *) ondisk, needed_length, false, true); + } errno = 0; pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE); if ((write(fd, ondisk, needed_length)) != needed_length) @@ -2033,6 +2042,14 @@ CheckPointSnapBuild(void) { elog(DEBUG1, "removing snapbuild snapshot %s", path); + /* Do not wallog AUX file at replica */ + if (XLogInsertAllowed()) + { + char prefix[MAXPGPATH + 
sizeof(PG_LOGICAL_SNAPSHOTS_DIR) + sizeof("neon-file:")]; + /* NEON specific: delete file from storage using logical message */ + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, NULL, 0, false, true); + } /* * It's not particularly harmful, though strange, if we can't * remove the file here. Don't prevent the checkpoint from diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index a7838751bff..7c5f1ac3ae8 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -50,6 +50,7 @@ #include "replication/slotsync.h" #include "replication/slot.h" #include "replication/walsender_private.h" +#include "replication/message.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/proc.h" @@ -950,6 +951,15 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) sprintf(path, "%s/%s", PG_REPLSLOT_DIR, NameStr(slot->data.name)); sprintf(tmppath, "%s/%s.tmp", PG_REPLSLOT_DIR, NameStr(slot->data.name)); + if (SlotIsLogical(slot) && XLogInsertAllowed()) + { + /* NEON specific: delete slot from storage using logical message */ + char prefix[MAXPGPATH + sizeof("neon-file:") + sizeof("/state")]; + snprintf(prefix, sizeof(prefix), "neon-file:%s/state", path); + elog(LOG, "Drop replication slot %s", path); + LogLogicalMessage(prefix, NULL, 0, false, true); + } + /* * Rename the slot directory on disk, so that we'll no longer recognize * this as a valid slot. 
Note that if this fails, we've got to mark the @@ -2324,6 +2334,15 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) ReplicationSlotOnDiskChecksummedSize); FIN_CRC32C(cp.checksum); + if (SlotIsLogical(slot) && XLogInsertAllowed() && cp.slotdata.restart_lsn != InvalidXLogRecPtr) + { + /* NEON specific: persist slot in storage using logical message */ + char prefix[MAXPGPATH + sizeof("neon-file:")]; + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + elog(LOG, "Save replication slot at %s restart_lsn=%X/%X", path, LSN_FORMAT_ARGS(cp.slotdata.restart_lsn)); + LogLogicalMessage(prefix, (char*)&cp, sizeof cp, false, true); + } + errno = 0; pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_WRITE); if ((write(fd, &cp, sizeof(cp))) != sizeof(cp)) From 07f0aa6ceb24267d690ebab4211ee835e25bee71 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 21 Aug 2024 18:33:19 +0200 Subject: [PATCH 16/50] Make logical apply worker take into account syncrep flush position. Without this, it may ack higher than it, losing data on endpoint restart. --- src/backend/replication/logical/worker.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index fd11805a44c..2793b7f81ad 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -167,6 +167,7 @@ #include "replication/logicalworker.h" #include "replication/origin.h" #include "replication/walreceiver.h" +#include "replication/walsender_private.h" #include "replication/worker_internal.h" #include "rewrite/rewriteHandler.h" #include "storage/buffile.h" @@ -3491,6 +3492,19 @@ get_flush_position(XLogRecPtr *write, XLogRecPtr *flush, dlist_mutable_iter iter; XLogRecPtr local_flush = GetFlushRecPtr(NULL); + /* + * If synchronous replication is configured, take into account its position. 
+ * This is particularly important in neon where WAL is hardened only after + * it is flushed on majority of safekeepers. + */ + if (SyncRepStandbyNames != NULL && SyncRepStandbyNames[0] != '\0') + { + /* assumes u64 read is atomic */ + XLogRecPtr sync_rep_flush = WalSndCtl->lsn[SYNC_REP_WAIT_FLUSH]; + + local_flush = Min(local_flush, sync_rep_flush); + } + *write = InvalidXLogRecPtr; *flush = InvalidXLogRecPtr; From b0a2bf0a29a5f8469f63ef248363ba1ed7c83fe5 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 21 Aug 2024 18:34:24 +0200 Subject: [PATCH 17/50] NEON: Add hooks to allow on-demand WAL download This improves the scalability of Neon's WAL systems through bypassing the archiving systems, which would require out-of-process WAL requests. --- src/backend/replication/logical/logical.c | 8 ++++++++ src/backend/replication/slotfuncs.c | 1 + src/include/replication/logical.h | 1 + src/include/replication/walsender.h | 1 - 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index de7c85499bc..348f78bbf9d 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -46,6 +46,8 @@ #include "utils/inval.h" #include "utils/memutils.h" +void (*Custom_XLogReaderRoutines)(XLogReaderRoutine *xlr); + /* data for errcontext callback */ typedef struct LogicalErrorCallbackState { @@ -183,6 +185,12 @@ StartupDecodingContext(List *output_plugin_options, if (!fast_forward) LoadOutputPlugin(&ctx->callbacks, NameStr(slot->data.plugin)); + /* + * NEON: override page_read/segment_open/segment_close functions to support on-demand WAL download + */ + if (Custom_XLogReaderRoutines != NULL) + Custom_XLogReaderRoutines(xl_routine); + /* * Now that the slot's xmin has been set, we can announce ourselves as a * logical decoding backend which doesn't need to be checked individually diff --git a/src/backend/replication/slotfuncs.c 
b/src/backend/replication/slotfuncs.c index 36cc2ed4e44..3190c8d9ad1 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -24,6 +24,7 @@ #include "utils/guc.h" #include "utils/pg_lsn.h" + /* * Helper function for creating a new physical replication slot with * given arguments. Note that this function doesn't release the created diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index 2e562bee5a9..9d0acebd36d 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -114,6 +114,7 @@ typedef struct LogicalDecodingContext bool processing_required; } LogicalDecodingContext; +extern void (*Custom_XLogReaderRoutines)(XLogReaderRoutine *xlr); extern void CheckLogicalDecodingRequirements(void); diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 3d2c7f8df2e..8b475261a6d 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -37,7 +37,6 @@ extern PGDLLIMPORT int wal_sender_timeout; extern PGDLLIMPORT bool log_replication_commands; struct XLogReaderRoutine; -extern PGDLLIMPORT void (*WalSender_Custom_XLogReaderRoutines)(struct XLogReaderRoutine *xlr); extern void InitWalSender(void); extern bool exec_replication_command(const char *cmd_string); From 4bfa2a11fd740c56dddf698216ca6a76df9a4aa5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 21 Aug 2024 18:37:40 +0200 Subject: [PATCH 18/50] Use smgrexists() to enforce uniqueness of generated relfilenumber Rather than access(), which doesn't work with Neon's non-local relfilenodes. 
--- src/backend/catalog/catalog.c | 44 +++++++++-------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 59caae8f1bc..29944bf153e 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -556,10 +556,10 @@ GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn) RelFileNumber GetNewRelFileNumber(Oid reltablespace, Relation pg_class, char relpersistence) { - RelFileLocatorBackend rlocator; - RelPathStr rpath; + RelFileLocator locator; bool collides; ProcNumber procNumber; + SMgrRelation srel; /* * If we ever get here during pg_upgrade, there's something wrong; all @@ -583,51 +583,29 @@ GetNewRelFileNumber(Oid reltablespace, Relation pg_class, char relpersistence) } /* This logic should match RelationInitPhysicalAddr */ - rlocator.locator.spcOid = reltablespace ? reltablespace : MyDatabaseTableSpace; - rlocator.locator.dbOid = - (rlocator.locator.spcOid == GLOBALTABLESPACE_OID) ? + locator.spcOid = reltablespace ? reltablespace : MyDatabaseTableSpace; + locator.dbOid = + (locator.spcOid == GLOBALTABLESPACE_OID) ? InvalidOid : MyDatabaseId; - /* - * The relpath will vary based on the backend number, so we must - * initialize that properly here to make sure that any collisions based on - * filename are properly detected. 
- */ - rlocator.backend = procNumber; - do { CHECK_FOR_INTERRUPTS(); /* Generate the OID */ if (pg_class) - rlocator.locator.relNumber = GetNewOidWithIndex(pg_class, ClassOidIndexId, + locator.relNumber = GetNewOidWithIndex(pg_class, ClassOidIndexId, Anum_pg_class_oid); else - rlocator.locator.relNumber = GetNewObjectId(); + locator.relNumber = GetNewObjectId(); /* Check for existing file of same name */ - rpath = relpath(rlocator, MAIN_FORKNUM); - - if (access(rpath.str, F_OK) == 0) - { - /* definite collision */ - collides = true; - } - else - { - /* - * Here we have a little bit of a dilemma: if errno is something - * other than ENOENT, should we declare a collision and loop? In - * practice it seems best to go ahead regardless of the errno. If - * there is a colliding file we will get an smgr failure when we - * attempt to create the new relation file. - */ - collides = false; - } + srel = smgropen(locator, procNumber); + collides = smgrexists(srel, MAIN_FORKNUM); + smgrclose(srel); } while (collides); - return rlocator.locator.relNumber; + return locator.relNumber; } /* From 63757c70ce6120977e0b502e6b7647a75656aab0 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 9 Aug 2024 00:00:02 +0100 Subject: [PATCH 19/50] Remove assertion. 
TODO figure out why and add a comment --- src/backend/access/transam/xlog.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 79050b7043f..ab252ffba78 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6838,7 +6838,8 @@ GetRedoStartLsn(void) XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI) { - Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE); + // NEON + //Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE); RefreshXLogWriteResult(LogwrtResult); From 879b07a8c4d3839a01540a9b7cc6b89ea4126b94 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 9 Aug 2024 22:45:53 +0100 Subject: [PATCH 20/50] fix wal-redo buffer management TODO: the logic around wal_redo + local buffers in other places too. --- src/backend/storage/buffer/bufmgr.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9fef9979232..c57c260e14d 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -658,7 +658,7 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) Assert(RelationIsValid(reln)); Assert(BlockNumberIsValid(blockNum)); - if (RelationUsesLocalBuffers(reln)) + if (RelationUsesLocalBuffers(reln) || am_wal_redo_postgres) { /* see comments in ReadBufferExtended */ if (RELATION_IS_OTHER_TEMP(reln)) @@ -1057,7 +1057,7 @@ ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid) } else if (isLocalBuf) { - Assert(BufferIsLocal(buffer)); + Assert(BufferIsLocal(buffer) || am_wal_redo_postgres); /* Simple case for non-shared buffers. 
*/ bufHdr = GetLocalBufferDescriptor(-buffer - 1); need_to_zero = StartLocalBufferIO(bufHdr, true, false); @@ -1138,7 +1138,7 @@ PinBufferForBlock(Relation rel, persistence == RELPERSISTENCE_PERMANENT || persistence == RELPERSISTENCE_UNLOGGED)); - if (persistence == RELPERSISTENCE_TEMP) + if (persistence == RELPERSISTENCE_TEMP || am_wal_redo_postgres) { io_context = IOCONTEXT_NORMAL; io_object = IOOBJECT_TEMP_RELATION; @@ -1155,7 +1155,7 @@ PinBufferForBlock(Relation rel, smgr->smgr_rlocator.locator.relNumber, smgr->smgr_rlocator.backend); - if (persistence == RELPERSISTENCE_TEMP) + if (persistence == RELPERSISTENCE_TEMP || am_wal_redo_postgres) { bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr); if (*foundPtr) @@ -1652,7 +1652,7 @@ WaitReadBuffers(ReadBuffersOperation *operation) IOContext io_context; IOObject io_object; - if (operation->persistence == RELPERSISTENCE_TEMP) + if (operation->persistence == RELPERSISTENCE_TEMP || am_wal_redo_postgres) { io_context = IOCONTEXT_NORMAL; io_object = IOOBJECT_TEMP_RELATION; @@ -1803,7 +1803,7 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) if (flags & READ_BUFFERS_SYNCHRONOUSLY) ioh_flags |= PGAIO_HF_SYNCHRONOUS; - if (persistence == RELPERSISTENCE_TEMP) + if (persistence == RELPERSISTENCE_TEMP || am_wal_redo_postgres) { io_context = IOCONTEXT_NORMAL; io_object = IOOBJECT_TEMP_RELATION; @@ -1904,7 +1904,7 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) operation->smgr->smgr_rlocator.backend, true); - if (persistence == RELPERSISTENCE_TEMP) + if (persistence == RELPERSISTENCE_TEMP || am_wal_redo_postgres) pgBufferUsage.local_blks_hit += 1; else pgBufferUsage.shared_blks_hit += 1; @@ -1951,7 +1951,7 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len); pgaio_io_register_callbacks(ioh, - persistence == RELPERSISTENCE_TEMP ? 
+ (persistence == RELPERSISTENCE_TEMP || am_wal_redo_postgres) ? PGAIO_HCB_LOCAL_BUFFER_READV : PGAIO_HCB_SHARED_BUFFER_READV, flags); @@ -1974,7 +1974,7 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start, 1, io_buffers_len * BLCKSZ); - if (persistence == RELPERSISTENCE_TEMP) + if (persistence == RELPERSISTENCE_TEMP || am_wal_redo_postgres) pgBufferUsage.local_blks_read += io_buffers_len; else pgBufferUsage.shared_blks_read += io_buffers_len; @@ -2593,7 +2593,7 @@ ExtendBufferedRelCommon(BufferManagerRelation bmr, bmr.smgr->smgr_rlocator.backend, extend_by); - if (bmr.relpersistence == RELPERSISTENCE_TEMP) + if (bmr.relpersistence == RELPERSISTENCE_TEMP || am_wal_redo_postgres) first_block = ExtendBufferedRelLocal(bmr, fork, flags, extend_by, extend_upto, buffers, &extend_by); @@ -4587,7 +4587,7 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, rlocator = smgr_reln->smgr_rlocator; /* If it's a local relation, it's localbuf.c's problem. */ - if (RelFileLocatorBackendIsTemp(rlocator)) + if (RelFileLocatorBackendIsTemp(rlocator) || am_wal_redo_postgres) { if (rlocator.backend == MyProcNumber) { @@ -4717,7 +4717,7 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) /* If it's a local relation, it's localbuf.c's problem. 
*/ for (i = 0; i < nlocators; i++) { - if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator)) + if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator) || am_wal_redo_postgres) { if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber) DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator); @@ -4984,7 +4984,7 @@ FlushRelationBuffers(Relation rel) BufferDesc *bufHdr; SMgrRelation srel = RelationGetSmgr(rel); - if (RelationUsesLocalBuffers(rel)) + if (RelationUsesLocalBuffers(rel) || am_wal_redo_postgres) { for (i = 0; i < NLocBuffer; i++) { From 68a01af4f97e103e411ef6f577c3915cb12c478a Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 22 Jan 2025 13:25:56 +0000 Subject: [PATCH 21/50] Add new GUC disable_logical_replication_subscribers (#567) --- src/backend/replication/logical/launcher.c | 11 +++++++++++ src/include/replication/logicallauncher.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 4aed0dfcebb..d26c860d972 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -51,6 +51,8 @@ int max_logical_replication_workers = 4; int max_sync_workers_per_subscription = 2; int max_parallel_apply_workers_per_subscription = 2; +bool disable_logical_replication_subscribers = false; /* NEON: GUC is defined in neon extension */ + LogicalRepWorker *MyLogicalRepWorker = NULL; typedef struct LogicalRepCtxStruct @@ -343,6 +345,15 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), errmsg("cannot start logical replication workers when \"max_active_replication_origins\" is 0"))); + /* NEON GUC that allows to control logical replication without postgres restart */ + if (disable_logical_replication_subscribers) + { + ereport(WARNING, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot start logical replication workers when 
disable_logical_replication_subscribers=true"))); + return false; + } + /* * We need to do the modification of the shared memory under lock so that * we have consistent view. diff --git a/src/include/replication/logicallauncher.h b/src/include/replication/logicallauncher.h index 82b202f3305..c9c9b5d30fd 100644 --- a/src/include/replication/logicallauncher.h +++ b/src/include/replication/logicallauncher.h @@ -16,6 +16,8 @@ extern PGDLLIMPORT int max_logical_replication_workers; extern PGDLLIMPORT int max_sync_workers_per_subscription; extern PGDLLIMPORT int max_parallel_apply_workers_per_subscription; +extern PGDLLIMPORT bool disable_logical_replication_subscribers; + extern void ApplyLauncherRegister(void); extern void ApplyLauncherMain(Datum main_arg); From 5f7b1c853841c1e8307339ff5f8c5f02f56396c0 Mon Sep 17 00:00:00 2001 From: alexanderlaw Date: Wed, 29 Jan 2025 12:28:30 +0200 Subject: [PATCH 22/50] Avoid runtime failure in CopyXLogRecordToWAL with sanitizers (cf. 46ab07ff) (#554) --- src/backend/access/transam/xlog.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ab252ffba78..5e9282f33da 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1336,7 +1336,8 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, } Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0); - memcpy(currpos, rdata_data, rdata_len); + if (rdata_len > 0) + memcpy(currpos, rdata_data, rdata_len); currpos += rdata_len; CurrPos += rdata_len; freespace -= rdata_len; From 4ac2a9f8671d3a2cba820e18a5559f70c56c7ba1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 1 Feb 2025 12:28:18 +0200 Subject: [PATCH 23/50] Add --save-records option to pg_waldump Co-authored-by: Heikki Linnakangas --- src/backend/access/transam/xlogreader.c | 3 + src/bin/pg_waldump/pg_waldump.c | 76 ++++++++++++++++++++++++- 
src/include/access/xlogreader.h | 1 + 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 72044678f0d..cbe45044f72 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -916,6 +916,9 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) SKIP_INVALID_RECORD(RecPtr); } + if (state->force_record_reassemble) + memcpy(state->readRecordBuf, record, total_len); + state->NextRecPtr = RecPtr + MAXALIGN(total_len); state->DecodeRecPtr = RecPtr; diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index ba5541be988..d53ea40212f 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -47,6 +47,8 @@ static volatile sig_atomic_t time_to_stop = false; static const RelFileLocator emptyRelFileLocator = {0, 0, 0}; +static FILE* save_records_file; + typedef struct XLogDumpPrivate { TimeLineID timeline; @@ -84,6 +86,7 @@ typedef struct XLogDumpConfig /* save options */ char *save_fullpage_path; + char *save_records_file_path; } XLogDumpConfig; @@ -865,11 +868,35 @@ usage(void) printf(_(" -z, --stats[=record] show statistics instead of records\n" " (optionally, show per-record statistics)\n")); printf(_(" --save-fullpage=DIR save full page images to DIR\n")); + printf(_(" --save-records=FILE save selected WAL records to the file\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); } + +static void +write_pq_int32(FILE* f, int32 val) +{ + val = pg_hton32(val); + fwrite(&val, sizeof(val), 1, f); +} + +static void +write_pq_int64(FILE* f, int64 val) +{ + val = pg_hton64(val); + fwrite(&val, sizeof(val), 1, f); +} + +static void +write_pq_message(FILE* f, char tag, uint32 len) +{ + fputc(tag, f); + write_pq_int32(f, len + 4); +} + + int main(int argc, 
char **argv) { @@ -909,6 +936,7 @@ main(int argc, char **argv) {"version", no_argument, NULL, 'V'}, {"stats", optional_argument, NULL, 'z'}, {"save-fullpage", required_argument, NULL, 1}, + {"save-records", required_argument, NULL, 2}, {NULL, 0, NULL, 0} }; @@ -962,6 +990,7 @@ main(int argc, char **argv) config.filter_by_relation_forknum = InvalidForkNumber; config.filter_by_fpw = false; config.save_fullpage_path = NULL; + config.save_records_file_path = NULL; config.stats = false; config.stats_per_record = false; config.ignore_format_errors = false; @@ -1180,6 +1209,9 @@ main(int argc, char **argv) case 1: config.save_fullpage_path = pg_strdup(optarg); break; + case 2: + config.save_records_file_path = pg_strdup(optarg); + break; default: goto bad_argument; } @@ -1280,6 +1312,9 @@ main(int argc, char **argv) if (config.save_fullpage_path != NULL) create_fullpage_directory(config.save_fullpage_path); + if (config.save_records_file_path) + save_records_file = fopen(config.save_records_file_path, "wb"); + /* parse files as start/end boundaries, extract path if not specified */ if (optind < argc) { @@ -1388,6 +1423,27 @@ main(int argc, char **argv) if (!xlogreader_state) pg_fatal("out of memory while allocating a WAL reading processor"); + if (save_records_file) + { + /* + * NEON: We dump records in the format recognized by walredo process. + * one character tag + 4 bytes length. + * If relation and block number was specified, then BeginRedoForBlock ('B') record is first + * written, containing relation info and block number. If fork is not specified, then main fork is assumed. + * Then it is followed by ApplyRecord ('A') records which specify record LSN and assembled WAL record raw data. + * Finally GetPage ('G') is written to make walredo to return image of the reconstructed page. 
+ */ + if (config.filter_by_relation_enabled && config.filter_by_relation_block_enabled) + { + write_pq_message(save_records_file, 'B', 17); + fputc(config.filter_by_relation_forknum == InvalidForkNumber ? MAIN_FORKNUM : config.filter_by_relation_forknum, save_records_file); + write_pq_int32(save_records_file, config.filter_by_relation.spcOid); + write_pq_int32(save_records_file, config.filter_by_relation.dbOid); + write_pq_int32(save_records_file, config.filter_by_relation.relNumber); + write_pq_int32(save_records_file, config.filter_by_relation_block); + } + xlogreader_state->force_record_reassemble = true; + } if(single_file) { if(config.ignore_format_errors) @@ -1490,7 +1546,12 @@ main(int argc, char **argv) else XLogDumpDisplayRecord(&config, xlogreader_state); } - + if (save_records_file) + { + write_pq_message(save_records_file, 'A', record->xl_tot_len + sizeof(XLogRecPtr)); + write_pq_int64(save_records_file, xlogreader_state->ReadRecPtr); + fwrite(xlogreader_state->readRecordBuf, record->xl_tot_len, 1, save_records_file); + } /* save full pages if requested */ if (config.save_fullpage_path != NULL) XLogRecordSaveFPWs(xlogreader_state, config.save_fullpage_path); @@ -1502,6 +1563,19 @@ main(int argc, char **argv) break; } + if (save_records_file) + { + if (config.filter_by_relation_enabled && config.filter_by_relation_block_enabled) + { + write_pq_message(save_records_file, 'G', 17); + fputc(config.filter_by_relation_forknum == InvalidForkNumber ? 
MAIN_FORKNUM : config.filter_by_relation_forknum, save_records_file); + write_pq_int32(save_records_file, config.filter_by_relation.spcOid); + write_pq_int32(save_records_file, config.filter_by_relation.dbOid); + write_pq_int32(save_records_file, config.filter_by_relation.relNumber); + write_pq_int32(save_records_file, config.filter_by_relation_block); + } + fclose(save_records_file); + } if (config.stats == true && !config.quiet) XLogDumpDisplayStats(&config, &stats); diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index d10dd359f56..fa2f960058a 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -220,6 +220,7 @@ struct XLogReaderState bool skip_page_validation; bool skip_invalid_records; bool skip_lsn_checks; + bool force_record_reassemble; /* ---------------------------------------- * Decoded representation of current record From 4e8f0631484ff2a841ced30d8ebfc0d489909b04 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 2 Oct 2024 09:43:03 +0300 Subject: [PATCH 24/50] Persist pg_stat file using AUX mechanism The feature is disabled by default, but can be enabled by setting the neon_pgstat_file_size_limit integer limit to a value that's not 0 (disabled) Co-authored-by: Heikki Linnakangas --- src/backend/access/heap/rewriteheap.c | 43 +---------- src/backend/access/transam/xlog.c | 7 ++ src/backend/replication/logical/message.c | 87 +++++++++++++++++++++++ src/backend/utils/activity/pgstat.c | 10 ++- src/include/pgstat.h | 3 +- src/include/replication/message.h | 4 ++ 6 files changed, 110 insertions(+), 44 deletions(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index be36907e852..d8996cf2aea 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -752,41 +752,6 @@ raw_heap_insert(RewriteState state, HeapTuple tup) * ------------------------------------------------------------------------ */ -/* - * 
NEON: we need to persist mapping file in WAL - */ -static void -wallog_mapping_file(char const* path, int fd) -{ - char prefix[MAXPGPATH]; - - /* Do not wallog AUX file at replica */ - if (!XLogInsertAllowed()) - return; - - snprintf(prefix, sizeof(prefix), "neon-file:%s", path); - if (fd < 0) - { - elog(DEBUG1, "neon: deleting contents of rewrite file %s", path); - /* unlink file */ - LogLogicalMessage(prefix, NULL, 0, false, false); - } - else - { - off_t size = lseek(fd, 0, SEEK_END); - char* buf; - elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, (long)size); - if (size < 0) - elog(ERROR, "Failed to get size of mapping file: %m"); - buf = palloc((size_t)size); - lseek(fd, 0, SEEK_SET); - if (read(fd, buf, (size_t)size) != size) - elog(ERROR, "Failed to read mapping file: %m"); - LogLogicalMessage(prefix, buf, (size_t)size, false, false); - pfree(buf); - } -} - /* * Do preparations for logging logical mappings during a rewrite if * necessary. If we detect that we don't need to log anything we'll prevent @@ -922,7 +887,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state) errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, written, len))); src->off += len; - wallog_mapping_file(src->path, FileGetRawDesc(src->vfd)); + wallog_file_descriptor(src->path, FileGetRawDesc(src->vfd), PG_UINT64_MAX); XLogBeginInsert(); XLogRegisterData(&xlrec, sizeof(xlrec)); @@ -1173,8 +1138,6 @@ heap_xlog_logical_rewrite(XLogReaderState *r) errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); - wallog_mapping_file(path, fd); - if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), @@ -1252,7 +1215,7 @@ CheckPointLogicalRewriteHeap(void) ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); - wallog_mapping_file(path, -1); + wallog_file_removal(path); } else { @@ -1281,7 +1244,7 @@ CheckPointLogicalRewriteHeap(void) errmsg("could not fsync file \"%s\": 
%m", path))); pgstat_report_wait_end(); - wallog_mapping_file(path, fd); + wallog_file_descriptor(path, fd, PG_UINT64_MAX); if (CloseTransientFile(fd) != 0) ereport(ERROR, diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 5e9282f33da..ec52d5e9190 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7874,6 +7874,13 @@ PreCheckPointGuts(int flags) { CheckPointReplicationState(flags); CheckPointBuffers(flags); + + /* + * pgstat_write_statsfile will be called later by before_shmem_exit() hook, but by then it's too late + * to write WAL records. In Neon, pgstat_write_statsfile() writes the pgstats file to the WAL, so we have + * to call it earlier. (The call that happens later is useless, but it doesn't do any harm either) + */ + pgstat_write_statsfile(); } } diff --git a/src/backend/replication/logical/message.c b/src/backend/replication/logical/message.c index ebc8454bad9..7657b6a6eab 100644 --- a/src/backend/replication/logical/message.c +++ b/src/backend/replication/logical/message.c @@ -31,10 +31,14 @@ #include "postgres.h" +#include +#include + #include "access/xact.h" #include "access/xloginsert.h" #include "miscadmin.h" #include "replication/message.h" +#include "storage/fd.h" /* * Write logical decoding message into XLog. @@ -93,3 +97,86 @@ logicalmsg_redo(XLogReaderState *record) /* This is only interesting for logical decoding, see decode.c. */ } + +/* + * NEON: remove AUX object + */ +void +wallog_file_removal(char const* path) +{ + char prefix[MAXPGPATH]; + + /* Do not wallog AUX file at replica */ + if (!XLogInsertAllowed()) + return; + + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + elog(DEBUG1, "neon: deleting contents of file %s", path); + + /* unlink file */ + LogLogicalMessage(prefix, NULL, 0, false, true); +} + +/* + * NEON: persist file in WAL to save it in persistent storage. 
+ */ +void +wallog_file_descriptor(char const* path, int fd, uint64_t limit) +{ + char prefix[MAXPGPATH]; + off_t size; + struct stat stat; + + Assert(fd >= 0); + + /* Do not wallog AUX file at replica */ + if (!XLogInsertAllowed()) + return; + + if (fstat(fd, &stat)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + size = stat.st_size; + + if (size < 0) + elog(ERROR, "Failed to get size of file %s: %m", path); + elog(DEBUG1, "neon: writing contents of file %s, size %ld", path, (long)size); + + if ((uint64_t)size > limit) + { + elog(WARNING, "Size of file %s %ld is larger than limit %ld", path, (long)size, (long)limit); + wallog_file_removal(path); + } + else + { + char* buf = palloc((size_t)size); /* palloc is not able to allocate more than MaxAllocSize (1Gb) but we don't expect such large AUX files */ + size_t offs = 0; + while (offs < size) { + ssize_t rc = pread(fd, buf + offs, (size_t)size - offs, offs); + if (rc <= 0) + elog(ERROR, "Failed to read file %s: %m", path); + offs += rc; + } + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, buf, (size_t)size, false, true); + pfree(buf); + } +} + +void +wallog_file(char const* path, uint64_t limit) +{ + int fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + else + { + wallog_file_descriptor(path, fd, limit); + CloseTransientFile(fd); + } +} diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index 6bc91ce0dad..36664dd2d38 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -106,6 +106,7 @@ #include "access/xact.h" #include "lib/dshash.h" #include "pgstat.h" +#include "replication/message.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/lwlock.h" @@ -180,7 +181,6 @@ typedef struct PgStat_SnapshotEntry * 
---------- */ -static void pgstat_write_statsfile(void); static void pgstat_read_statsfile(void); static void pgstat_init_snapshot_fixed(void); @@ -203,7 +203,7 @@ static inline bool pgstat_is_kind_valid(PgStat_Kind kind); bool pgstat_track_counts = false; int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE; - +int neon_pgstat_file_size_limit; /* ---------- * state shared with pgstat_*.c @@ -1566,7 +1566,7 @@ write_chunk(FILE *fpout, void *ptr, size_t len) * This function is called in the last process that is accessing the shared * stats so locking is not required. */ -static void +void pgstat_write_statsfile(void) { FILE *fpout; @@ -1733,6 +1733,10 @@ pgstat_write_statsfile(void) /* durable_rename already emitted log message */ unlink(tmpfile); } + else if (XLogInsertAllowed() && neon_pgstat_file_size_limit != 0) + { + wallog_file(statfile, (uint64_t) neon_pgstat_file_size_limit * 1024); + } } /* helpers for pgstat_read_statsfile() */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 378f2f2c2ba..0a62fb39fe7 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -516,6 +516,7 @@ extern void StatsShmemInit(void); extern void pgstat_restore_stats(void); extern void pgstat_discard_stats(void); extern void pgstat_before_server_shutdown(int code, Datum arg); +extern void pgstat_write_statsfile(void); /* Functions for backend initialization */ extern void pgstat_initialize(void); @@ -799,7 +800,7 @@ extern PgStat_WalStats *pgstat_fetch_stat_wal(void); extern PGDLLIMPORT bool pgstat_track_counts; extern PGDLLIMPORT int pgstat_track_functions; extern PGDLLIMPORT int pgstat_fetch_consistency; - +extern PGDLLIMPORT int neon_pgstat_file_size_limit; /* * Variables in pgstat_bgwriter.c diff --git a/src/include/replication/message.h b/src/include/replication/message.h index 8a0893ea539..bb50ec9c551 100644 --- a/src/include/replication/message.h +++ b/src/include/replication/message.h @@ -33,6 +33,10 @@ extern XLogRecPtr LogLogicalMessage(const 
char *prefix, const char *message, size_t size, bool transactional, bool flush); +extern void wallog_file(char const* path, uint64_t limit); +extern void wallog_file_descriptor(char const* path, int fd, uint64_t limit); +extern void wallog_file_removal(char const* path); + /* RMGR API */ #define XLOG_LOGICAL_MESSAGE 0x00 extern void logicalmsg_redo(XLogReaderState *record); From 6242c428924f5bd370128de16774e1cea6ecaa0c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 4 Mar 2025 21:50:29 +0000 Subject: [PATCH 25/50] Skip dropping tablesync replication slots on the publisher (#594) when we're in the process of dropping subscriptions inherited by a neon branch. Because these slots are still needed by the parent branch subscriptions. For regular slots we handle this by setting the slot_name to NONE before calling DROP SUBSCRIPTION, but tablesync slots are not exposed to SQL. rely on GUC disable_logical_replication_subscribers=true to know that we're in the Neon-specific process of dropping subscriptions. 
--- src/backend/commands/subscriptioncmds.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 4ff246cd943..72344761b24 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -1880,7 +1880,17 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) ReplicationSlotNameForTablesync(subid, relid, syncslotname, sizeof(syncslotname)); - ReplicationSlotDropAtPubNode(wrconn, syncslotname, true); + + if (disable_logical_replication_subscribers) + { + ereport(LOG, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("skip dropping tablesync slot \"%s\" when disable_logical_replication_subscribers=true", syncslotname))); + } + else + { + ReplicationSlotDropAtPubNode(wrconn, syncslotname, true); + } } } From 3006ef186afcae4c5f0906adabc137b2466bee5b Mon Sep 17 00:00:00 2001 From: Mark Novikov Date: Thu, 1 May 2025 17:24:24 +0400 Subject: [PATCH 26/50] NEON: Add option to skip event triggers in pg_dump --- doc/src/sgml/ref/pg_dump.sgml | 9 +++++++++ doc/src/sgml/ref/pg_dumpall.sgml | 9 +++++++++ doc/src/sgml/ref/pg_restore.sgml | 10 ++++++++++ src/bin/pg_dump/pg_backup.h | 2 ++ src/bin/pg_dump/pg_backup_archiver.c | 5 +++++ src/bin/pg_dump/pg_dump.c | 22 ++++++++++++++++++++++ src/bin/pg_dump/pg_dumpall.c | 5 +++++ src/bin/pg_dump/pg_restore.c | 4 ++++ src/bin/pg_dump/t/002_pg_dump.pl | 12 ++++++++++++ 9 files changed, 78 insertions(+) diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index 0bc7609bdf8..421e70fc20a 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -1148,6 +1148,15 @@ PostgreSQL documentation + + + + + Do not dump event triggers. 
+ + + + diff --git a/doc/src/sgml/ref/pg_dumpall.sgml b/doc/src/sgml/ref/pg_dumpall.sgml index 364442f00f2..b77818f7c8c 100644 --- a/doc/src/sgml/ref/pg_dumpall.sgml +++ b/doc/src/sgml/ref/pg_dumpall.sgml @@ -501,6 +501,15 @@ exclude database PATTERN + + + + + Do not dump event triggers. + + + + diff --git a/doc/src/sgml/ref/pg_restore.sgml b/doc/src/sgml/ref/pg_restore.sgml index 261ead15039..c4cc8455e13 100644 --- a/doc/src/sgml/ref/pg_restore.sgml +++ b/doc/src/sgml/ref/pg_restore.sgml @@ -774,6 +774,16 @@ PostgreSQL documentation + + + + + Do not output commands to restore event triggers, even if the archive + contains them. + + + + diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index 4ebef1e8644..b2648692165 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -99,6 +99,7 @@ typedef struct _restoreOptions int noOwner; /* Don't try to match original object owner */ int noTableAm; /* Don't issue table-AM-related commands */ int noTablespace; /* Don't issue tablespace-related commands */ + int no_event_triggers; /* Don't issue event-trigger-related commands */ int disable_triggers; /* disable triggers during data-only * restore */ int use_setsessauth; /* Use SET SESSION AUTHORIZATION commands @@ -192,6 +193,7 @@ typedef struct _dumpOptions int disable_triggers; int outputNoTableAm; int outputNoTablespaces; + int no_event_triggers; int use_setsessauth; int enable_row_security; int load_via_partition_root; diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index e0b9ea0259f..229d5b1f3b9 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -192,6 +192,7 @@ dumpOptionsFromRestoreOptions(RestoreOptions *ropt) dopt->no_publications = ropt->no_publications; dopt->no_security_labels = ropt->no_security_labels; dopt->no_subscriptions = ropt->no_subscriptions; + dopt->no_event_triggers = ropt->no_event_triggers; dopt->lockWaitTimeout = 
ropt->lockWaitTimeout; dopt->include_everything = ropt->include_everything; dopt->enable_row_security = ropt->enable_row_security; @@ -3032,6 +3033,10 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH) if (ropt->no_subscriptions && strcmp(te->desc, "SUBSCRIPTION") == 0) return 0; + /* If it's an event trigger, maybe ignore it */ + if (ropt->no_event_triggers && strcmp(te->desc, "EVENT TRIGGER") == 0) + return 0; + /* Ignore it if section is not to be dumped/restored */ switch (curSection) { diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index f867a1c0cbe..9c8d3adffa7 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -516,6 +516,7 @@ main(int argc, char **argv) {"no-security-labels", no_argument, &dopt.no_security_labels, 1}, {"no-statistics", no_argument, NULL, 21}, {"no-subscriptions", no_argument, &dopt.no_subscriptions, 1}, + {"no-event-triggers", no_argument, &dopt.no_event_triggers, 1}, {"no-toast-compression", no_argument, &dopt.no_toast_compression, 1}, {"no-unlogged-table-data", no_argument, &dopt.no_unlogged_table_data, 1}, {"no-sync", no_argument, NULL, 7}, @@ -1186,6 +1187,7 @@ main(int argc, char **argv) ropt->no_publications = dopt.no_publications; ropt->no_security_labels = dopt.no_security_labels; ropt->no_subscriptions = dopt.no_subscriptions; + ropt->no_event_triggers = dopt.no_event_triggers; ropt->lockWaitTimeout = dopt.lockWaitTimeout; ropt->include_everything = dopt.include_everything; ropt->enable_row_security = dopt.enable_row_security; @@ -1296,6 +1298,7 @@ help(const char *progname) printf(_(" --no-security-labels do not dump security label assignments\n")); printf(_(" --no-statistics do not dump statistics\n")); printf(_(" --no-subscriptions do not dump subscriptions\n")); + printf(_(" --no-event-triggers do not dump event triggers\n")); printf(_(" --no-table-access-method do not dump table access methods\n")); printf(_(" --no-tablespaces do not dump tablespace 
assignments\n")); printf(_(" --no-toast-compression do not dump TOAST compression methods\n")); @@ -6753,6 +6756,15 @@ getFuncs(Archive *fout) * pg_catalog if they have an ACL different from what's shown in * pg_init_privs (so we have to join to pg_init_privs; annoying). */ + + /* + * If --no-event-triggers is specified, exclude functions that return + * event triggers. + */ + const char *not_event_trigger_check; + + not_event_trigger_check = (fout->dopt->no_event_triggers ? "\n AND p.prorettype <> 'pg_catalog.event_trigger'::regtype" : " "); + if (fout->remoteVersion >= 90600) { const char *not_agg_check; @@ -6773,6 +6785,7 @@ getFuncs(Archive *fout) "AND pip.classoid = 'pg_proc'::regclass " "AND pip.objsubid = 0) " "WHERE %s" + "%s" "\n AND NOT EXISTS (SELECT 1 FROM pg_depend " "WHERE classid = 'pg_proc'::regclass AND " "objid = p.oid AND deptype = 'i')" @@ -6788,6 +6801,7 @@ getFuncs(Archive *fout) "\n (p.oid = pg_transform.trffromsql" "\n OR p.oid = pg_transform.trftosql))", not_agg_check, + not_event_trigger_check, g_last_builtin_oid, g_last_builtin_oid); if (dopt->binary_upgrade) @@ -6811,6 +6825,7 @@ getFuncs(Archive *fout) "proowner " "FROM pg_proc p " "WHERE NOT proisagg" + "%s" "\n AND NOT EXISTS (SELECT 1 FROM pg_depend " "WHERE classid = 'pg_proc'::regclass AND " "objid = p.oid AND deptype = 'i')" @@ -6821,6 +6836,7 @@ getFuncs(Archive *fout) "\n OR EXISTS (SELECT 1 FROM pg_cast" "\n WHERE pg_cast.oid > '%u'::oid" "\n AND p.oid = pg_cast.castfunc)", + not_event_trigger_check, g_last_builtin_oid); if (fout->remoteVersion >= 90500) @@ -8697,6 +8713,12 @@ getEventTriggers(Archive *fout) if (fout->remoteVersion < 90300) return; + if (fout->dopt->no_event_triggers) + { + pg_log_info("skipping event triggers"); + return; + } + query = createPQExpBuffer(); appendPQExpBufferStr(query, diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 27aa1b65698..06c1bf10b8e 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ 
b/src/bin/pg_dump/pg_dumpall.c @@ -103,6 +103,7 @@ static int no_schema = 0; static int no_statistics = 0; static int no_subscriptions = 0; static int no_toast_compression = 0; +static int no_event_triggers = 0; static int no_unlogged_table_data = 0; static int no_role_passwords = 0; static int with_statistics = 0; @@ -168,6 +169,7 @@ main(int argc, char *argv[]) {"use-set-session-authorization", no_argument, &use_setsessauth, 1}, {"no-comments", no_argument, &no_comments, 1}, {"no-data", no_argument, &no_data, 1}, + {"no-event-triggers", no_argument, &no_event_triggers, 1}, {"no-policies", no_argument, &no_policies, 1}, {"no-publications", no_argument, &no_publications, 1}, {"no-role-passwords", no_argument, &no_role_passwords, 1}, @@ -467,6 +469,8 @@ main(int argc, char *argv[]) appendPQExpBufferStr(pgdumpopts, " --no-statistics"); if (no_subscriptions) appendPQExpBufferStr(pgdumpopts, " --no-subscriptions"); + if (no_event_triggers) + appendPQExpBufferStr(pgdumpopts, " --no-event-triggers"); if (no_toast_compression) appendPQExpBufferStr(pgdumpopts, " --no-toast-compression"); if (no_unlogged_table_data) @@ -695,6 +699,7 @@ help(void) printf(_(" --no-security-labels do not dump security label assignments\n")); printf(_(" --no-statistics do not dump statistics\n")); printf(_(" --no-subscriptions do not dump subscriptions\n")); + printf(_(" --no-event-triggers do not dump event triggers\n")); printf(_(" --no-sync do not wait for changes to be written safely to disk\n")); printf(_(" --no-table-access-method do not dump table access methods\n")); printf(_(" --no-tablespaces do not dump tablespace assignments\n")); diff --git a/src/bin/pg_dump/pg_restore.c b/src/bin/pg_dump/pg_restore.c index 6c129278bc5..6e5a5132936 100644 --- a/src/bin/pg_dump/pg_restore.c +++ b/src/bin/pg_dump/pg_restore.c @@ -80,6 +80,7 @@ main(int argc, char **argv) static int no_security_labels = 0; static int no_statistics = 0; static int no_subscriptions = 0; + static int no_event_triggers = 
0; static int strict_names = 0; static int statistics_only = 0; static int with_statistics = 0; @@ -131,6 +132,7 @@ main(int argc, char **argv) {"use-set-session-authorization", no_argument, &use_setsessauth, 1}, {"no-comments", no_argument, &no_comments, 1}, {"no-data", no_argument, &no_data, 1}, + {"no-event-triggers", no_argument, &no_event_triggers, 1}, {"no-policies", no_argument, &no_policies, 1}, {"no-publications", no_argument, &no_publications, 1}, {"no-schema", no_argument, &no_schema, 1}, @@ -421,6 +423,7 @@ main(int argc, char **argv) opts->no_publications = no_publications; opts->no_security_labels = no_security_labels; opts->no_subscriptions = no_subscriptions; + opts->no_event_triggers = no_event_triggers; if (if_exists && !opts->dropSchema) pg_fatal("option --if-exists requires option -c/--clean"); @@ -544,6 +547,7 @@ usage(const char *progname) printf(_(" --no-security-labels do not restore security labels\n")); printf(_(" --no-statistics do not restore statistics\n")); printf(_(" --no-subscriptions do not restore subscriptions\n")); + printf(_(" --no-event-triggers do not restore event triggers\n")); printf(_(" --no-table-access-method do not restore table access methods\n")); printf(_(" --no-tablespaces do not restore tablespace assignments\n")); printf(_(" --section=SECTION restore named section (pre-data, data, or post-data)\n")); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 1aea8a25d5e..58b61e739e4 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -632,6 +632,13 @@ 'postgres', ], }, + no_event_triggers => { + dump_cmd => [ + 'pg_dump', '--no-sync', + "--file=$tempdir/no_event_triggers.sql", + '--no-event-triggers', 'postgres', + ], + }, no_privs => { dump_cmd => [ 'pg_dump', '--no-sync', @@ -869,6 +876,7 @@ exclude_measurement_data => 1, no_toast_compression => 1, no_large_objects => 1, + no_event_triggers => 1, no_owner => 1, no_policies => 1, no_privs => 1, @@ 
-2471,6 +2479,7 @@ unlike => { exclude_dump_test_schema => 1, only_dump_measurement => 1, + no_event_triggers => 1, }, }, @@ -2584,6 +2593,9 @@ \n\s+\QEXECUTE FUNCTION dump_test.event_trigger_func();\E /xm, like => { %full_runs, section_post_data => 1, }, + unlike => { + no_event_triggers => 1, + }, }, 'CREATE TRIGGER test_trigger' => { From da4d53d355dcce2c3cfc1d04b7822b603d08e49a Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 30 Jul 2024 22:22:29 +0200 Subject: [PATCH 27/50] NEON: Dynamic installation of extension's libraries. If an extension's control file is present, but not all the files that we expected, we'll notify a sidecar process to download and install these files, after which we'll try accessing again. This allows faster starts and dynamic installation of various files without impacting the cold start time as much, as most extensions are not accessed during or immediately after startup. Author: Anastasia Lubennikova Author: Tristan Partin --- src/backend/commands/extension.c | 53 +++++++++++++++- src/backend/utils/fmgr/dfmgr.c | 101 +++++++++++++++++++++++++++++-- src/include/fmgr.h | 6 ++ 3 files changed, 153 insertions(+), 7 deletions(-) diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index e6f9ab6dfd6..c85f2936021 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -437,6 +437,9 @@ find_extension_control_filename(ExtensionControlFile *control) static char * get_extension_script_directory(ExtensionControlFile *control) { + char *result; + struct stat fst; + /* * The directory parameter can be omitted, absolute, or relative to the * installation's base directory, which can be the sharedir or a custom @@ -444,13 +447,32 @@ get_extension_script_directory(ExtensionControlFile *control) * .control file was found. 
*/ if (!control->directory) - return pstrdup(control->control_dir); + { + result = pstrdup(control->control_dir); + goto check_result_and_return; + } if (is_absolute_path(control->directory)) - return pstrdup(control->directory); + { + result = pstrdup(control->directory); + goto check_result_and_return; + } Assert(control->basedir != NULL); - return psprintf("%s/%s", control->basedir, control->directory); + result = psprintf("%s/%s", control->basedir, control->directory); + +check_result_and_return: + /* NEON: If directory does not exist, check remote extension storage */ + if (stat(result, &fst) < 0) + { + /* request download of extension files from for control->directory */ + if (download_extension_file_hook != NULL) + { + download_extension_file_hook(control->directory, false); + } + } + + return result; } static char * @@ -1466,9 +1488,11 @@ identify_update_path(ExtensionControlFile *control, { List *result; List *evi_list; + bool attempted_download = false; ExtensionVersionInfo *evi_start; ExtensionVersionInfo *evi_target; +reidentify: /* Extract the version update graph from the script directory */ evi_list = get_ext_ver_list(control); @@ -1479,6 +1503,16 @@ identify_update_path(ExtensionControlFile *control, /* Find shortest path */ result = find_update_path(evi_list, evi_start, evi_target, false, false); + /* Before we report an ERROR, try to download a remote extension */ + if (result == NIL && download_extension_file_hook && !attempted_download) + { + attempted_download = true; + download_extension_file_hook(control->name, false); + + /* Try again to find the shortest path */ + goto reidentify; + } + if (result == NIL) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -1701,6 +1735,7 @@ CreateExtensionInternal(char *extensionName, * will get us there. 
*/ filename = get_extension_script_filename(pcontrol, NULL, versionName); + if (stat(filename, &fst) == 0) { /* Easy, no extra scripts */ @@ -1712,7 +1747,9 @@ CreateExtensionInternal(char *extensionName, List *evi_list; ExtensionVersionInfo *evi_start; ExtensionVersionInfo *evi_target; + bool attempted_download = false; + reidentify: /* Extract the version update graph from the script directory */ evi_list = get_ext_ver_list(pcontrol); @@ -1723,6 +1760,16 @@ CreateExtensionInternal(char *extensionName, evi_start = find_install_path(evi_list, evi_target, &updateVersions); + /* Before we report an ERROR, try to download a remote extension */ + if (evi_start == NULL && download_extension_file_hook && !attempted_download) + { + attempted_download = true; + download_extension_file_hook(extensionName, false); + + /* Try again to find the install path */ + goto reidentify; + } + /* Fail if no path ... */ if (evi_start == NULL) ereport(ERROR, diff --git a/src/backend/utils/fmgr/dfmgr.c b/src/backend/utils/fmgr/dfmgr.c index 4bb84ff7087..b63806af3ae 100644 --- a/src/backend/utils/fmgr/dfmgr.c +++ b/src/backend/utils/fmgr/dfmgr.c @@ -27,6 +27,7 @@ #include "storage/shmem.h" #include "utils/hsearch.h" +download_extension_file_hook_type download_extension_file_hook = NULL; /* signature for PostgreSQL-specific library init function */ typedef void (*PG_init_t) (void); @@ -71,9 +72,12 @@ char *Dynamic_library_path; static void *internal_load_library(const char *libname); pg_noreturn static void incompatible_module_error(const char *libname, const Pg_abi_values *module_magic_data); -static char *expand_dynamic_library_name(const char *name); +static char *expand_dynamic_library_name(const char *name, bool *is_found); static void check_restricted_library_name(const char *name); +/* Try loading the extension data that contains given file name*/ +static void neon_try_load(const char *filename); + /* ABI values that module needs to match to be accepted */ static const Pg_abi_values 
magic_data = PG_MODULE_ABI_DATA; @@ -98,6 +102,7 @@ load_external_function(const char *filename, const char *funcname, char *fullname; void *lib_handle; void *retval; + bool is_found = true; /* * If the value starts with "$libdir/", strip that. This is because many @@ -108,7 +113,17 @@ load_external_function(const char *filename, const char *funcname, filename += 8; /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_external_function: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library, unless we already did */ lib_handle = internal_load_library(fullname); @@ -130,6 +145,72 @@ load_external_function(const char *filename, const char *funcname, return retval; } +void +neon_try_load(const char *filename) +{ + bool have_slash; + char *request_name; + size_t pkglib_path_len; + +#define LIBDIR_PLACEHOLDER_LEN (sizeof("$libdir") - 1) + + pkglib_path_len = strlen(pkglib_path); + + /* add .so suffix if it is not present */ + if (strstr(filename, DLSUFFIX) == NULL) + { + request_name = psprintf("%s%s", filename, DLSUFFIX); + elog(DEBUG3, "neon_try_load: add DLSUFFIX: %s", request_name); + } + else + { + request_name = pstrdup(filename); + elog(DEBUG3, "neon_try_load: DLSUFFIX already present: %s", request_name); + } + + have_slash = (first_dir_separator(request_name) != NULL); + + elog(DEBUG3, "neon_try_load: request_name: %s, pkglib_path %s", request_name, pkglib_path); + + if (strncmp(request_name, "$libdir/", LIBDIR_PLACEHOLDER_LEN + 1) == 0) + { + char *new_request_name = psprintf("%s", request_name + 
LIBDIR_PLACEHOLDER_LEN + 1); + pfree(request_name); + request_name = new_request_name; + + elog(DEBUG3, "neon_try_load: omit $libdir/: %s", request_name); + } + /* if name contains pkglib_path as prefix, strip it and only request the file name */ + else if (pkglib_path[0] != '\0' && + strncmp(request_name, pkglib_path, pkglib_path_len) == 0) + { + bool trailing_slash; + char *new_request_name; + + /* If there is a directory separator, it should be removed. */ + trailing_slash = request_name[pkglib_path_len] == '/'; + + new_request_name = psprintf("%s", request_name + pkglib_path_len + (int)trailing_slash); + pfree(request_name); + request_name = new_request_name; + + elog(DEBUG3, "neon_try_load: omit pkglib_path: %s", request_name); + } + else if (have_slash) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("unexpected path in dynamic library name: %s", + filename))); + } + + elog(DEBUG3, "neon_try_load: final request_name: %s", request_name); + + if (download_extension_file_hook) + download_extension_file_hook(request_name, true); +#undef LIBDIR_PLACEHOLDER_LEN +} + /* * This function loads a shlib file without looking up any particular * function in it. 
If the same shlib has previously been loaded, @@ -142,13 +223,24 @@ void load_file(const char *filename, bool restricted) { char *fullname; + bool is_found = true; /* Apply security restriction if requested */ if (restricted) check_restricted_library_name(filename); /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_file: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library, unless we already did */ (void) internal_load_library(fullname); @@ -456,7 +548,7 @@ get_loaded_module_details(DynamicFileList *dfptr, * The result will always be freshly palloc'd. */ static char * -expand_dynamic_library_name(const char *name) +expand_dynamic_library_name(const char *name, bool *is_found) { bool have_slash; char *new; @@ -502,6 +594,7 @@ expand_dynamic_library_name(const char *name) * If we can't find the file, just return the string as-is. The ensuing * load attempt will fail and report a suitable message. */ + *is_found = false; return pstrdup(name); } diff --git a/src/include/fmgr.h b/src/include/fmgr.h index 0fe7b4ebc77..95e7cfb1be4 100644 --- a/src/include/fmgr.h +++ b/src/include/fmgr.h @@ -848,4 +848,10 @@ extern PGDLLIMPORT fmgr_hook_type fmgr_hook; #define FmgrHookIsNeeded(fn_oid) \ (!needs_fmgr_hook ? 
false : (*needs_fmgr_hook)(fn_oid)) + + +// download_extension_file_hook (filename, is_library) +typedef bool (*download_extension_file_hook_type) (const char *, bool); +extern PGDLLIMPORT download_extension_file_hook_type download_extension_file_hook; + #endif /* FMGR_H */ From ada24085e4f0817023f867a036717949ab63fb90 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 4 Jul 2025 16:04:58 +0200 Subject: [PATCH 28/50] NEON: Add relpersistence to SMgrRelation and smgropen This allows the smgr to make storage choices based on the relpersistence of the relation, which is critical in future commits. Committed separately to make rebasing & backpatching easier. --- src/backend/access/transam/xlogprefetcher.c | 3 +- src/backend/access/transam/xlogutils.c | 4 +-- src/backend/catalog/catalog.c | 2 +- src/backend/catalog/storage.c | 13 +++++---- src/backend/commands/dbcommands.c | 3 +- src/backend/commands/sequence.c | 3 +- src/backend/storage/buffer/bufmgr.c | 32 ++++++++++++--------- src/backend/storage/buffer/localbuf.c | 3 +- src/backend/storage/smgr/md.c | 7 +++-- src/backend/storage/smgr/smgr.c | 21 ++++++++++---- src/backend/utils/cache/relcache.c | 3 +- src/include/storage/aio_types.h | 6 +++- src/include/storage/smgr.h | 9 +++++- src/include/utils/rel.h | 3 +- 14 files changed, 73 insertions(+), 39 deletions(-) diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 7735562db01..dbe6733591f 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -716,7 +716,8 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * same relation (with some scheme to handle invalidations * safely), but for now we'll call smgropen() every time. 
*/ - reln = smgropen(block->rlocator, INVALID_PROC_NUMBER); + reln = smgropen(block->rlocator, INVALID_PROC_NUMBER, + RELPERSISTENCE_PERMANENT); /* * If the relation file doesn't exist on disk, for example because diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index d2b6c4545d0..552cb92f3e6 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -494,7 +494,7 @@ XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, } /* Open the relation at smgr level */ - smgr = smgropen(rlocator, INVALID_PROC_NUMBER); + smgr = smgropen(rlocator, INVALID_PROC_NUMBER, RELPERSISTENCE_PERMANENT); /* * Create the target file if it doesn't already exist. This lets us cope @@ -623,7 +623,7 @@ CreateFakeRelcacheEntry(RelFileLocator rlocator) * Set up a non-pinned SMgrRelation reference, so that we don't need to * worry about unpinning it on error. */ - rel->rd_smgr = smgropen(rlocator, INVALID_PROC_NUMBER); + rel->rd_smgr = smgropen(rlocator, INVALID_PROC_NUMBER, RELPERSISTENCE_PERMANENT); return rel; } diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 29944bf153e..11f9c3bc170 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -600,7 +600,7 @@ GetNewRelFileNumber(Oid reltablespace, Relation pg_class, char relpersistence) locator.relNumber = GetNewObjectId(); /* Check for existing file of same name */ - srel = smgropen(locator, procNumber); + srel = smgropen(locator, procNumber, relpersistence); collides = smgrexists(srel, MAIN_FORKNUM); smgrclose(srel); } while (collides); diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 5ba81adda4c..516e99ae236 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -147,7 +147,7 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, return NULL; /* placate compiler */ } - srel = smgropen(rlocator, procNumber); + srel 
= smgropen(rlocator, procNumber, relpersistence); smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) @@ -704,8 +704,7 @@ smgrDoPendingDeletes(bool isCommit) { SMgrRelation srel; - srel = smgropen(pending->rlocator, pending->procNumber); - + srel = smgropen(pending->rlocator, pending->procNumber, 0); /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) { @@ -785,7 +784,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) uint64 total_blocks = 0; SMgrRelation srel; - srel = smgropen(pendingsync->rlocator, INVALID_PROC_NUMBER); + srel = smgropen(pendingsync->rlocator, INVALID_PROC_NUMBER, 0); /* * We emit newpage WAL records for smaller relations. @@ -994,7 +993,8 @@ smgr_redo(XLogReaderState *record) xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; - reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER); + reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER, + RELPERSISTENCE_PERMANENT); smgrcreate(reln, xlrec->forkNum, true); } else if (info == XLOG_SMGR_TRUNCATE) @@ -1008,7 +1008,8 @@ smgr_redo(XLogReaderState *record) int nforks = 0; bool need_fsm_vacuum = false; - reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER); + reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER, + RELPERSISTENCE_PERMANENT); /* * Forcibly create relation if it doesn't exist (which suggests that diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 9bc14aa1b49..e9d1d388ae6 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -275,7 +275,8 @@ ScanSourceDatabasePgClass(Oid tbid, Oid dbid, char *srcpath) rlocator.dbOid = dbid; rlocator.relNumber = relfilenumber; - smgr = smgropen(rlocator, INVALID_PROC_NUMBER); + /* NEON: This rlocator is for the pg_class table, which is always RP_PERMANENT */ + smgr = smgropen(rlocator, INVALID_PROC_NUMBER, RELPERSISTENCE_PERMANENT); nblocks = smgrnblocks(smgr, MAIN_FORKNUM); smgrclose(smgr); diff --git 
a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 30faf79898a..4e5829bd28c 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -349,7 +349,8 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) { SMgrRelation srel; - srel = smgropen(rel->rd_locator, INVALID_PROC_NUMBER); + srel = smgropen(rel->rd_locator, INVALID_PROC_NUMBER, + rel->rd_rel->relpersistence); smgrcreate(srel, INIT_FORKNUM, false); log_smgrcreate(&rel->rd_locator, INIT_FORKNUM); fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index c57c260e14d..04531c14c7a 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -848,10 +848,11 @@ ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent) { - SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER); + char relpersistence = permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED; + SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER, relpersistence); return ReadBuffer_common(NULL, smgr, - permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, + relpersistence, forkNum, blockNum, mode, strategy); } @@ -4352,7 +4353,10 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, /* Find smgr relation for buffer */ if (reln == NULL) - reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER); + reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER, + io_object == IOOBJECT_RELATION ? + RELPERSISTENCE_PERMANENT : + RELPERSISTENCE_UNLOGGED); TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag), buf->tag.blockNum, @@ -5177,6 +5181,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, Page srcPage; Page dstPage; bool use_wal; + char relpersistence = permanent ? 
RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED; BlockNumber nblocks; BlockNumber blkno; PGIOAlignedBlock buf; @@ -5194,7 +5199,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM); /* Get number of blocks in the source relation. */ - nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER), + nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER, relpersistence), forkNum); /* Nothing to copy; just return. */ @@ -5206,7 +5211,8 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, * relation before starting to copy block by block. */ memset(buf.data, 0, BLCKSZ); - smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1, + smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER, relpersistence), + forkNum, nblocks - 1, buf.data, true); /* This is a bulk operation, so use buffer access strategies. */ @@ -5216,7 +5222,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, /* Initialize streaming read */ p.current_blocknum = 0; p.last_exclusive = nblocks; - src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER); + src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER, relpersistence); /* * It is safe to use batchmode as block_range_read_stream_cb takes no @@ -5226,7 +5232,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, READ_STREAM_USE_BATCHING, bstrategy_src, src_smgr, - permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, + relpersistence, forkNum, block_range_read_stream_cb, &p, @@ -5293,8 +5299,8 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator, relpersistence = permanent ? 
RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED; - src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER); - dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER); + src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER, relpersistence); + dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER, relpersistence); /* * Create and copy all forks of the relation. During create database we @@ -6564,7 +6570,7 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context) i += ahead; /* and finally tell the kernel to write the data to storage */ - reln = smgropen(currlocator, INVALID_PROC_NUMBER); + reln = smgropen(currlocator, INVALID_PROC_NUMBER, 0); smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks); } @@ -7229,11 +7235,11 @@ buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, if (is_temp) { - Assert(td->smgr.is_temp); + Assert(td->smgr.relpersistence == RELPERSISTENCE_TEMP); Assert(pgaio_io_get_owner(ioh) == MyProcNumber); } else - Assert(!td->smgr.is_temp); + Assert(td->smgr.relpersistence != RELPERSISTENCE_TEMP); /* * Iterate over all the buffers affected by this IO and call the @@ -7322,7 +7328,7 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td, BlockNumber first = td->smgr.blockNum; BlockNumber last = first + nblocks - 1; ProcNumber errProc = - td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER; + (td->smgr.relpersistence == RELPERSISTENCE_TEMP) ? 
MyProcNumber : INVALID_PROC_NUMBER; RelPathStr rpath = relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum); bool zeroed_any, diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index a32fe09138b..2cc65d7ce14 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -19,7 +19,6 @@ #include "executor/instrument.h" #include "miscadmin.h" #include "pgstat.h" -#include "../../../../../../pg_install/v17/include/postgresql/server/c.h" #include "storage/aio.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" @@ -201,7 +200,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln) /* Find smgr relation for buffer */ if (reln == NULL) reln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), - MyProcNumber); + MyProcNumber, RELPERSISTENCE_TEMP); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2ccb0faceb5..de99cadb512 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1592,7 +1592,7 @@ DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo) srels = palloc(sizeof(SMgrRelation) * ndelrels); for (i = 0; i < ndelrels; i++) { - SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER); + SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER, 0); if (isRedo) { @@ -1879,7 +1879,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) int mdsyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER); + SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER, 0); File file; instr_time io_start; bool need_to_close; @@ -2034,7 +2034,8 @@ md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel) RelPathStr path; path = relpathbackend(td->smgr.rlocator, - td->smgr.is_temp ? 
MyProcNumber : INVALID_PROC_NUMBER, + td->smgr.relpersistence == RELPERSISTENCE_TEMP ? + MyProcNumber : INVALID_PROC_NUMBER, td->smgr.forkNum); if (result.error_data != 0) diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index bce37a36d51..fe62770a48d 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -237,7 +237,7 @@ smgrshutdown(int code, Datum arg) * This does not attempt to actually open the underlying files. */ SMgrRelation -smgropen(RelFileLocator rlocator, ProcNumber backend) +smgropen(RelFileLocator rlocator, ProcNumber backend, char relpersistence) { RelFileLocatorBackend brlocator; SMgrRelation reln; @@ -277,11 +277,22 @@ smgropen(RelFileLocator rlocator, ProcNumber backend) /* it is not pinned yet */ reln->pincount = 0; + + /* fill it with the right relpersistence */ + reln->smgr_relpersistence = relpersistence; dlist_push_tail(&unpinned_relns, &reln->node); /* implementation-specific initialization */ smgrsw[reln->smgr_which].smgr_open(reln); } + else if (reln->smgr_relpersistence == 0 && relpersistence != 0) + { + /* + * fix the persistence of the SMgrRelation now that we know the correct + * value + */ + reln->smgr_relpersistence = relpersistence; + } RESUME_INTERRUPTS(); @@ -1042,7 +1053,7 @@ pgaio_io_set_target_smgr(PgAioHandle *ioh, sd->smgr.forkNum = forknum; sd->smgr.blockNum = blocknum; sd->smgr.nblocks = nblocks; - sd->smgr.is_temp = SmgrIsTemp(smgr); + sd->smgr.relpersistence = smgr->smgr_relpersistence; /* Temp relations should never be fsync'd */ sd->smgr.skip_fsync = skip_fsync && !SmgrIsTemp(smgr); } @@ -1066,12 +1077,12 @@ smgr_aio_reopen(PgAioHandle *ioh) */ Assert(!INTERRUPTS_CAN_BE_PROCESSED()); - if (sd->smgr.is_temp) + if (sd->smgr.relpersistence == RELPERSISTENCE_TEMP) procno = pgaio_io_get_owner(ioh); else procno = INVALID_PROC_NUMBER; - reln = smgropen(sd->smgr.rlocator, procno); + reln = smgropen(sd->smgr.rlocator, procno, sd->smgr.relpersistence); switch 
(pgaio_io_get_op(ioh)) { case PGAIO_OP_INVALID: @@ -1098,7 +1109,7 @@ smgr_aio_describe_identity(const PgAioTargetData *sd) char *desc; path = relpathbackend(sd->smgr.rlocator, - sd->smgr.is_temp ? + sd->smgr.relpersistence == RELPERSISTENCE_TEMP ? MyProcNumber : INVALID_PROC_NUMBER, sd->smgr.forkNum); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 559ba9cdb2c..d55362117cb 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3848,7 +3848,8 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) * fails at this stage, the new cluster will need to be recreated * anyway. */ - srel = smgropen(relation->rd_locator, relation->rd_backend); + srel = smgropen(relation->rd_locator, relation->rd_backend, + persistence); smgrdounlinkall(&srel, 1, false); smgrclose(srel); } diff --git a/src/include/storage/aio_types.h b/src/include/storage/aio_types.h index afee85c787b..d508a416d9d 100644 --- a/src/include/storage/aio_types.h +++ b/src/include/storage/aio_types.h @@ -66,7 +66,11 @@ typedef union PgAioTargetData BlockNumber blockNum; /* blknum relative to begin of reln */ BlockNumber nblocks; ForkNumber forkNum:8; /* don't waste 4 byte for four values */ - bool is_temp:1; /* proc can be inferred by owning AIO */ + /* + * Persistence requirement for this relation. + * proc for RELPERSISTENCE_TEMP can be inferred by owning AIO. + */ + char relpersistence:8; bool skip_fsync:1; } smgr; } PgAioTargetData; diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 3964d9334b3..1cb2c2850a8 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -15,6 +15,7 @@ #define SMGR_H #include "lib/ilist.h" +#include "catalog/pg_class.h" #include "storage/aio_types.h" #include "storage/block.h" #include "storage/relfilelocator.h" @@ -66,6 +67,11 @@ typedef struct SMgrRelationData * link in list of all unpinned SMgrRelations. 
*/ int pincount; + + /* + * Persistence of the relation + */ + char smgr_relpersistence; dlist_node node; } SMgrRelationData; @@ -77,7 +83,8 @@ typedef SMgrRelationData *SMgrRelation; extern PGDLLIMPORT const PgAioTargetInfo aio_smgr_target_info; extern void smgrinit(void); -extern SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend); +extern SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend, + char relpersistence); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrpin(SMgrRelation reln); extern void smgrunpin(SMgrRelation reln); diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index b552359915f..c684028f197 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -579,7 +579,8 @@ RelationGetSmgr(Relation rel) { if (unlikely(rel->rd_smgr == NULL)) { - rel->rd_smgr = smgropen(rel->rd_locator, rel->rd_backend); + rel->rd_smgr = smgropen(rel->rd_locator, rel->rd_backend, + rel->rd_rel->relpersistence); smgrpin(rel->rd_smgr); } return rel->rd_smgr; From 21eebb4cc2b64b1f6f334bd1b80f470f5ea865f4 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 4 Jul 2025 16:21:59 +0200 Subject: [PATCH 29/50] NEON: Move f_smgr typedef into smgr.h from smgr.c This allows other systems to use the type. Committed separately for ease of rebasing & conflict resolution. --- src/backend/storage/smgr/smgr.c | 51 --------------------------------- src/include/storage/smgr.h | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 51 deletions(-) diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index fe62770a48d..695daa6a14f 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -74,57 +74,6 @@ #include "utils/hsearch.h" #include "utils/inval.h" - -/* - * This struct of function pointers defines the API between smgr.c and - * any individual storage manager module. 
Note that smgr subfunctions are - * generally expected to report problems via elog(ERROR). An exception is - * that smgr_unlink should use elog(WARNING), rather than erroring out, - * because we normally unlink relations during post-commit/abort cleanup, - * and so it's too late to raise an error. Also, various conditions that - * would normally be errors should be allowed during bootstrap and/or WAL - * recovery --- see comments in md.c for details. - */ -typedef struct f_smgr -{ - void (*smgr_init) (void); /* may be NULL */ - void (*smgr_shutdown) (void); /* may be NULL */ - void (*smgr_open) (SMgrRelation reln); - void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, - bool isRedo); - bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, - bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); - void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync); - bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks); - uint32 (*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); - void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, - void **buffers, BlockNumber nblocks); - void (*smgr_startreadv) (PgAioHandle *ioh, - SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, - void **buffers, BlockNumber nblocks); - void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, - const void **buffers, BlockNumber nblocks, - bool skipFsync); - void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); - BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_truncate) (SMgrRelation 
reln, ForkNumber forknum, - BlockNumber old_blocks, BlockNumber nblocks); - void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum); - int (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off); -} f_smgr; - static const f_smgr smgrsw[] = { /* magnetic disk */ { diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 1cb2c2850a8..a9a2d6fa204 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -80,6 +80,56 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileLocatorBackendIsTemp((smgr)->smgr_rlocator) +/* + * This struct of function pointers defines the API between smgr.c and + * any individual storage manager module. Note that smgr subfunctions are + * generally expected to report problems via elog(ERROR). An exception is + * that smgr_unlink should use elog(WARNING), rather than erroring out, + * because we normally unlink relations during post-commit/abort cleanup, + * and so it's too late to raise an error. Also, various conditions that + * would normally be errors should be allowed during bootstrap and/or WAL + * recovery --- see comments in md.c for details. 
+ */ +typedef struct f_smgr +{ + void (*smgr_init) (void); /* may be NULL */ + void (*smgr_shutdown) (void); /* may be NULL */ + void (*smgr_open) (SMgrRelation reln); + void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, + bool isRedo); + bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, + bool isRedo); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); + bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); + uint32 (*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); + void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + void **buffers, BlockNumber nblocks); + void (*smgr_startreadv) (PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + void **buffers, BlockNumber nblocks); + void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, + bool skipFsync); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); + BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, + BlockNumber old_blocks, BlockNumber nblocks); + void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum); + int (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off); +} f_smgr; + extern PGDLLIMPORT const PgAioTargetInfo aio_smgr_target_info; extern void smgrinit(void); From 3e55b05acd482d26c9b8543dc86c423b206af356 
Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 7 Jul 2025 15:08:40 +0200 Subject: [PATCH 30/50] NEON: Allow registering new SMgrs Committed separately to make rebasing efforts easier in the future. --- src/backend/storage/smgr/md.c | 7 ++ src/backend/storage/smgr/smgr.c | 114 ++++++++++++++++++++++++-------- src/include/storage/md.h | 1 + src/include/storage/smgr.h | 13 +++- 4 files changed, 105 insertions(+), 30 deletions(-) diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index de99cadb512..b312d17a50f 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1481,6 +1481,13 @@ mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off) return FileGetRawDesc(v->mdfd_vfd); } +/* md can always serve the smgr operations of all relfilenodes */ +bool +mdowns(RelFileLocator rlocator, ProcNumber backend, char relpersistence) +{ + return true; +} + /* * register_dirty_segment() -- Mark a relation segment as needing fsync * diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 695daa6a14f..264bdb7e55f 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -74,33 +74,34 @@ #include "utils/hsearch.h" #include "utils/inval.h" -static const f_smgr smgrsw[] = { - /* magnetic disk */ - { - .smgr_init = mdinit, - .smgr_shutdown = NULL, - .smgr_open = mdopen, - .smgr_close = mdclose, - .smgr_create = mdcreate, - .smgr_exists = mdexists, - .smgr_unlink = mdunlink, - .smgr_extend = mdextend, - .smgr_zeroextend = mdzeroextend, - .smgr_prefetch = mdprefetch, - .smgr_maxcombine = mdmaxcombine, - .smgr_readv = mdreadv, - .smgr_startreadv = mdstartreadv, - .smgr_writev = mdwritev, - .smgr_writeback = mdwriteback, - .smgr_nblocks = mdnblocks, - .smgr_truncate = mdtruncate, - .smgr_immedsync = mdimmedsync, - .smgr_registersync = mdregistersync, - .smgr_fd = mdfd, - } +static const f_smgr smgr_md = { + .smgr_name = "md", + .smgr_init = mdinit, 
+ .smgr_shutdown = NULL, + .smgr_open = mdopen, + .smgr_close = mdclose, + .smgr_create = mdcreate, + .smgr_exists = mdexists, + .smgr_unlink = mdunlink, + .smgr_extend = mdextend, + .smgr_zeroextend = mdzeroextend, + .smgr_prefetch = mdprefetch, + .smgr_maxcombine = mdmaxcombine, + .smgr_readv = mdreadv, + .smgr_startreadv = mdstartreadv, + .smgr_writev = mdwritev, + .smgr_writeback = mdwriteback, + .smgr_nblocks = mdnblocks, + .smgr_truncate = mdtruncate, + .smgr_immedsync = mdimmedsync, + .smgr_registersync = mdregistersync, + .smgr_fd = mdfd, + .smgr_owns = mdowns, }; -static const int NSmgr = lengthof(smgrsw); +static const f_smgr *smgrsw = &smgr_md; + +static int NSmgr = 1; /* * Each backend has a hashtable that stores all extant SMgrRelation objects. @@ -124,6 +125,39 @@ const PgAioTargetInfo aio_smgr_target_info = { .describe_identity = smgr_aio_describe_identity, }; +SmgrId +smgrregister(const f_smgr *smgr) +{ + f_smgr *new_smgrsw; + + for (int i = 0; i < NSmgr; i++) + { + /* different name */ + if (strcmp(smgr->smgr_name, smgrsw[i].smgr_name) != 0) + continue; + /* exactly the same struct, return the previously-returned ID */ + if (memcmp(smgr, &smgrsw[i], sizeof(f_smgr)) == 0) + return i; + + /* different contents on same name, error out */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("smgr \"%s\" already defined with ID %d", + smgr->smgr_name, i))); + } + + if (NSmgr == 1) + { + new_smgrsw = malloc(sizeof(f_smgr) * 2); + memcpy(new_smgrsw, smgrsw, sizeof(f_smgr)); + } + else + new_smgrsw = realloc((f_smgr *) smgrsw, sizeof(f_smgr) * (NSmgr + 1)); + + new_smgrsw[NSmgr] = *smgr; + smgrsw = new_smgrsw; + return NSmgr++; +} /* * smgrinit(), smgrshutdown() -- Initialize or shut down storage @@ -218,11 +252,23 @@ smgropen(RelFileLocator rlocator, ProcNumber backend, char relpersistence) /* Initialize it if not present before */ if (!found) { + SmgrId smgrid = 0; /* hash_search already filled in the lookup key */ reln->smgr_targblock = 
InvalidBlockNumber; for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + /* most recently registered SMgr wins, so we iterate backwards */ + for (int i = NSmgr; i > 0; i--) + { + smgrid = i - 1; + + if (smgrsw[smgrid].smgr_owns(rlocator, backend, relpersistence)) + break; + } + + Assert(smgrid >= 0); + reln->smgr_which = smgrid; /* it is not pinned yet */ reln->pincount = 0; @@ -237,10 +283,20 @@ smgropen(RelFileLocator rlocator, ProcNumber backend, char relpersistence) else if (reln->smgr_relpersistence == 0 && relpersistence != 0) { /* - * fix the persistence of the SMgrRelation now that we know the correct - * value + * Fix the persistence of the SMgrRelation if we didn't know it already. + * + * If we knew the persistence already, make sure that it hasn't changed. */ - reln->smgr_relpersistence = relpersistence; + if (reln->smgr_relpersistence == 0) + { + reln->smgr_relpersistence = relpersistence; + } + else if (reln->smgr_relpersistence != 0 && relpersistence != 0 && + reln->smgr_relpersistence != relpersistence) + { + elog(ERROR, "relpersistence mismatch: smgropen %c vs SmgrRelation %c", + relpersistence, reln->smgr_relpersistence); + } } RESUME_INTERRUPTS(); diff --git a/src/include/storage/md.h b/src/include/storage/md.h index b563c27abf0..683fa7eacf7 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -53,6 +53,7 @@ extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); extern void mdregistersync(SMgrRelation reln, ForkNumber forknum); extern int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off); +extern bool mdowns(RelFileLocator rlocator, ProcNumber backend, char relpersistence); extern void ForgetDatabaseSyncRequests(Oid dbid); extern void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo); diff --git 
a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a9a2d6fa204..6f058216570 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -20,6 +20,10 @@ #include "storage/block.h" #include "storage/relfilelocator.h" +typedef int SmgrId; + +#define SMGRID_MD ((SmgrId) 0) + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -53,7 +57,7 @@ typedef struct SMgrRelationData * Fields below here are intended to be private to smgr.c and its * submodules. Do not touch them from elsewhere. */ - int smgr_which; /* storage manager selector */ + SmgrId smgr_which; /* storage manager selector */ /* * for md.c; per-fork arrays of the number of open segments @@ -92,6 +96,7 @@ typedef SMgrRelationData *SMgrRelation; */ typedef struct f_smgr { + const char *smgr_name; void (*smgr_init) (void); /* may be NULL */ void (*smgr_shutdown) (void); /* may be NULL */ void (*smgr_open) (SMgrRelation reln); @@ -128,10 +133,16 @@ typedef struct f_smgr void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum); int (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off); + + /* NEON: test if this smgr is responsible for this relation */ + bool (*smgr_owns) (RelFileLocator rlocator, ProcNumber backend, + char relpersistence); } f_smgr; extern PGDLLIMPORT const PgAioTargetInfo aio_smgr_target_info; +extern SmgrId smgrregister(const f_smgr *smgr); + extern void smgrinit(void); extern SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend, char relpersistence); From 75ce7dfe8e2a8fd96d073308683631ee1371d332 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 7 Jul 2025 16:20:46 +0200 Subject: [PATCH 31/50] NEON: Add smgr hooks for 2-phase relation builds It enables relation builds to be bulk-logged after all relation pages have been filled. 
Committed separately for ease of rebasing. --- src/backend/access/gin/gininsert.c | 6 ++++++ src/backend/access/gist/gistbuild.c | 6 ++++++ src/backend/access/spgist/spginsert.c | 6 ++++++ src/backend/storage/smgr/smgr.c | 29 ++++++++++++++++++++++++++- src/include/storage/smgr.h | 10 +++++++++ 5 files changed, 56 insertions(+), 1 deletion(-) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index a65acd89104..b4cd0073259 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -597,6 +597,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(RelationGetSmgr(index)); + initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); @@ -746,6 +748,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats, true); + smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. 
@@ -757,6 +761,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) true); } + smgr_end_unlogged_build(RelationGetSmgr(index)); + /* * Return statistics */ diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 11ab63d6a8d..4e54053ac35 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -295,6 +295,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) Buffer buffer; Page page; + smgr_start_unlogged_build(RelationGetSmgr(index)); + /* initialize the root page */ buffer = gistNewBuffer(index, heap); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); @@ -327,6 +329,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) gistFreeBuildBuffers(buildstate.gfbb); } + smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); + /* * We didn't write WAL records as we built the index, so if * WAL-logging is required, write all pages to the WAL now. @@ -337,6 +341,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } + + smgr_end_unlogged_build(RelationGetSmgr(index)); } /* okay, all heap tuples are indexed */ diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 6a61e093fa0..a309d1e6d94 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -83,6 +83,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(RelationGetSmgr(index)); + /* * Initialize the meta page and root pages */ @@ -129,6 +131,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistUpdateMetaPage(index); + smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. 
@@ -140,6 +144,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) true); } + smgr_end_unlogged_build(RelationGetSmgr(index)); + result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 264bdb7e55f..7eb728ef1d9 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -97,6 +97,9 @@ static const f_smgr smgr_md = { .smgr_registersync = mdregistersync, .smgr_fd = mdfd, .smgr_owns = mdowns, + .smgr_start_unlogged_build = NULL, + .smgr_finish_unlogged_build_phase_1 = NULL, + .smgr_end_unlogged_build = NULL, }; static const f_smgr *smgrsw = &smgr_md; @@ -280,7 +283,7 @@ smgropen(RelFileLocator rlocator, ProcNumber backend, char relpersistence) /* implementation-specific initialization */ smgrsw[reln->smgr_which].smgr_open(reln); } - else if (reln->smgr_relpersistence == 0 && relpersistence != 0) + else { /* * Fix the persistence of the SMgrRelation if we didn't know it already. @@ -1008,6 +1011,30 @@ smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off) return fd; } +/* + * NEON: functions to mark the phases of an unlogged index build. 
+ */ +void +smgr_start_unlogged_build(SMgrRelation reln) +{ + if (smgrsw[reln->smgr_which].smgr_start_unlogged_build) + smgrsw[reln->smgr_which].smgr_start_unlogged_build(reln); +} + +void +smgr_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + if (smgrsw[reln->smgr_which].smgr_finish_unlogged_build_phase_1) + smgrsw[reln->smgr_which].smgr_finish_unlogged_build_phase_1(reln); +} + +void +smgr_end_unlogged_build(SMgrRelation reln) +{ + if (smgrsw[reln->smgr_which].smgr_end_unlogged_build) + smgrsw[reln->smgr_which].smgr_end_unlogged_build(reln); +} + /* * AtEOXact_SMgr * diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 6f058216570..53cd5a32f89 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -137,6 +137,11 @@ typedef struct f_smgr /* NEON: test if this smgr is responsible for this relation */ bool (*smgr_owns) (RelFileLocator rlocator, ProcNumber backend, char relpersistence); + + /* NEON: Hooks to smgr for unlogged build phases */ + void (*smgr_start_unlogged_build) (SMgrRelation reln); + void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); + void (*smgr_end_unlogged_build) (SMgrRelation reln); } f_smgr; extern PGDLLIMPORT const PgAioTargetInfo aio_smgr_target_info; @@ -188,6 +193,11 @@ extern void smgrregistersync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); extern bool ProcessBarrierSmgrRelease(void); +/* Neon: Change relpersistence for unlogged index builds */ +extern void smgr_start_unlogged_build(SMgrRelation reln); +extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); +extern void smgr_end_unlogged_build(SMgrRelation reln); + static inline void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer) From 45dc835f577eb94455c6fc832f9b7917e9d2bddc Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 8 Jul 2025 13:32:01 +0200 Subject: [PATCH 32/50] NEON: Add hook in smgr to allow on-demand SLRU segment download This 
allows us to remove SLRU segments from Neon's minimal-sized basebackups. --- src/backend/access/transam/slru.c | 97 +++++++++++++++++++++++++++---- src/backend/storage/smgr/smgr.c | 27 +++++++++ src/include/storage/smgr.h | 4 ++ 3 files changed, 116 insertions(+), 12 deletions(-) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index fe56286d9a9..2cc3b962ee5 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -71,6 +71,7 @@ #include "storage/fd.h" #include "storage/shmem.h" #include "utils/guc.h" +#include "storage/smgr.h" /* * Converts segment number to the filename of the segment. @@ -723,6 +724,59 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata) } } +/* + * NEON: we do not want to include large pg_xact/multixact files in basebackup and prefer + * to download them on demand to reduce startup time. + * If SLRU segment is not found, we try to download it from page server + */ +static int +SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) +{ + SlruShared shared = ctl->shared; + int segno; + int fd = -1; + int n_blocks; + char* buffer; + + /* If page is greater than latest written page, then do not try to download segment from server */ + if (ctl->PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number), pageno)) + return -1; + + segno = pageno / SLRU_PAGES_PER_SEGMENT; + + buffer = palloc(BLCKSZ * SLRU_PAGES_PER_SEGMENT); + n_blocks = smgr_read_slru_segment(path, segno, buffer); + if (n_blocks > 0) + { + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + pfree(buffer); + return -1; + } + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); + if (pg_pwrite(fd, buffer, n_blocks*BLCKSZ, 0) != n_blocks*BLCKSZ) + { + pgstat_report_wait_end(); + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + slru_errcause = 
SLRU_WRITE_FAILED; + slru_errno = errno; + + CloseTransientFile(fd); + pfree(buffer); + return -1; + } + pgstat_report_wait_end(); + } + pfree(buffer); + return fd; +} + /* * Wrapper of SlruInternalWritePage, for external callers. * fdata is always passed a NULL here. @@ -762,12 +816,18 @@ SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno) { /* expected: file doesn't exist */ if (errno == ENOENT) - return false; - - /* report error normally */ - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - SlruReportIOError(ctl, pageno, 0); + { + fd = SimpleLruDownloadSegment(ctl, pageno, path); + if (fd < 0) + return false; + } + else + { + /* report error normally */ + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); + } } if ((endpos = lseek(fd, 0, SEEK_END)) < 0) @@ -821,18 +881,31 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno) fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); if (fd < 0) { - if (errno != ENOENT || !InRecovery) + if (errno != ENOENT) { slru_errcause = SLRU_OPEN_FAILED; slru_errno = errno; return false; } - ereport(LOG, - (errmsg("file \"%s\" doesn't exist, reading as zeroes", - path))); - MemSet(shared->page_buffer[slotno], 0, BLCKSZ); - return true; + fd = SimpleLruDownloadSegment(ctl, pageno, path); + if (fd < 0) + { + if (!InRecovery) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + else + { + ereport(LOG, + (errmsg("file \"%s\" doesn't exist, reading as zeroes", + path))); + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + return true; + } + } } errno = 0; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 7eb728ef1d9..10aaaf82423 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -1035,6 +1035,33 @@ smgr_end_unlogged_build(SMgrRelation reln) smgrsw[reln->smgr_which].smgr_end_unlogged_build(reln); } +/* + * NEON: we do not want to include large pg_xact/multixact files in 
basebackup and prefer + * to download them on demand to reduce startup time. + * If SLRU segment is not found, we try to download it from page server + * + * This function returns number of blocks in segment. Usually it should be SLRU_PAGES_PER_SEGMENT but in case + * of partial segment, it can be smaller. Zero value means that segment doesn't exist. + * From Postgres point of view empty segment is the same as absent segment. + * + * This should really be a separate hook, not something that's in the smgr API, but + * oh well. + */ +int +smgr_read_slru_segment(const char* path, int segno, void* buffer) +{ + for (int i = NSmgr; i > 0; i--) + { + SmgrId smgr = (i - 1); + if (smgrsw[smgr].smgr_read_slru_segment) + { + return smgrsw[smgr].smgr_read_slru_segment(path, segno, buffer); + } + } + + return 0; +} + /* * AtEOXact_SMgr * diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 53cd5a32f89..ba78ca514f4 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -142,6 +142,7 @@ typedef struct f_smgr void (*smgr_start_unlogged_build) (SMgrRelation reln); void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); void (*smgr_end_unlogged_build) (SMgrRelation reln); + int (*smgr_read_slru_segment) (const char *path, int segno, void* buffer); } f_smgr; extern PGDLLIMPORT const PgAioTargetInfo aio_smgr_target_info; @@ -198,6 +199,9 @@ extern void smgr_start_unlogged_build(SMgrRelation reln); extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); extern void smgr_end_unlogged_build(SMgrRelation reln); +/* Neon: Allow on-demand download of SLRU segment data */ +extern int smgr_read_slru_segment(const char *path, int segno, void* buffer); + static inline void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer) From 9321993ba22c66d76dcaabc1a83e660450d20636 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 8 Jul 2025 13:33:15 +0200 Subject: [PATCH 33/50] NEON: Add #define to 
+ * This define can be used by extensions to determine that they are built for Neon.
*/ static int64 -calculate_relation_size(RelFileLocator *rfn, ProcNumber backend, ForkNumber forknum) +calculate_relation_size(RelFileLocator *rfn, ProcNumber backend, + ForkNumber forknum, char relpersistence) { - int64 totalsize = 0; - RelPathStr relationpath; - char pathname[MAXPGPATH]; - unsigned int segcount = 0; + const RelFileLocator zero = {0, 0, 0}; - relationpath = relpathbackend(*rfn, backend, forknum); + /* + * We can get asked the size of a relation without storage, in which case + * we get passed a zero RFL. + * Return 0 for those relations. + */ + if (RelFileLocatorEquals(*rfn, zero)) + return 0; - for (segcount = 0;; segcount++) + if (smgrexists(smgropen(*rfn, backend, relpersistence), forknum)) { - struct stat fst; - - CHECK_FOR_INTERRUPTS(); - - if (segcount == 0) - snprintf(pathname, MAXPGPATH, "%s", - relationpath.str); - else - snprintf(pathname, MAXPGPATH, "%s.%u", - relationpath.str, segcount); - - if (stat(pathname, &fst) < 0) - { - if (errno == ENOENT) - break; - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", pathname))); - } - totalsize += fst.st_size; + BlockNumber n = smgrnblocks(smgropen(*rfn, backend, relpersistence), forknum); + return (int64) n * BLCKSZ; } - return totalsize; + return 0; } Datum @@ -381,7 +376,8 @@ pg_relation_size(PG_FUNCTION_ARGS) PG_RETURN_NULL(); size = calculate_relation_size(&(rel->rd_locator), rel->rd_backend, - forkname_to_number(text_to_cstring(forkName))); + forkname_to_number(text_to_cstring(forkName)), + rel->rd_rel->relpersistence); relation_close(rel, AccessShareLock); @@ -406,7 +402,8 @@ calculate_toast_table_size(Oid toastrelid) /* toast heap size, including FSM and VM size */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastRel->rd_locator), - toastRel->rd_backend, forkNum); + toastRel->rd_backend, forkNum, + toastRel->rd_rel->relpersistence); /* toast index size, including FSM and VM size */ indexlist = 
RelationGetIndexList(toastRel); @@ -420,7 +417,8 @@ calculate_toast_table_size(Oid toastrelid) AccessShareLock); for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastIdxRel->rd_locator), - toastIdxRel->rd_backend, forkNum); + toastIdxRel->rd_backend, forkNum, + toastIdxRel->rd_rel->relpersistence); relation_close(toastIdxRel, AccessShareLock); } @@ -449,7 +447,7 @@ calculate_table_size(Relation rel) */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(rel->rd_locator), rel->rd_backend, - forkNum); + forkNum, rel->rd_rel->relpersistence); /* * Size of toast relation @@ -489,7 +487,8 @@ calculate_indexes_size(Relation rel) for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(idxRel->rd_locator), idxRel->rd_backend, - forkNum); + forkNum, + idxRel->rd_rel->relpersistence); relation_close(idxRel, AccessShareLock); } diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 57e01473607..8c66597dc17 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -151,6 +151,10 @@ typedef struct f_smgr int (*smgr_read_slru_segment) (const char *path, int segno, void* buffer); } f_smgr; +/* NEON: Alternative implementation of calculate_database_size(), to make it O(1) */ +typedef int64 (*dbsize_hook_type) (Oid dbOid); +extern PGDLLIMPORT dbsize_hook_type dbsize_hook; + extern PGDLLIMPORT const PgAioTargetInfo aio_smgr_target_info; extern SmgrId smgrregister(const f_smgr *smgr); From f3a2f1e76bb8b565413303972d9a45b4520bcd1e Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 8 Jul 2025 15:32:57 +0200 Subject: [PATCH 35/50] NEON: Add alternative tablespace_1.out Neon doesn't currently handle tablespaces in the way PostgreSQL expects, so the original tablespace.out would not allow all tests to succeed. 
--- src/test/regress/expected/tablespace_1.out | 974 +++++++++++++++++++++ 1 file changed, 974 insertions(+) create mode 100644 src/test/regress/expected/tablespace_1.out diff --git a/src/test/regress/expected/tablespace_1.out b/src/test/regress/expected/tablespace_1.out new file mode 100644 index 00000000000..5b9cefa6057 --- /dev/null +++ b/src/test/regress/expected/tablespace_1.out @@ -0,0 +1,974 @@ +-- relative tablespace locations are not allowed +CREATE TABLESPACE regress_tblspace LOCATION 'relative'; -- fail +ERROR: tablespace location must be an absolute path +-- empty tablespace locations are not usually allowed +CREATE TABLESPACE regress_tblspace LOCATION ''; -- fail +ERROR: tablespace location must be an absolute path +-- as a special developer-only option to allow us to use tablespaces +-- with streaming replication on the same server, an empty location +-- can be allowed as a way to say that the tablespace should be created +-- as a directory in pg_tblspc, rather than being a symlink +SET allow_in_place_tablespaces = true; +-- create a tablespace using WITH clause +CREATE TABLESPACE regress_tblspacewith LOCATION '' WITH (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +CREATE TABLESPACE regress_tblspacewith LOCATION '' WITH (random_page_cost = 3.0); -- ok +-- check to see the parameter was used +SELECT spcoptions FROM pg_tablespace WHERE spcname = 'regress_tblspacewith'; + spcoptions +------------------------ + {random_page_cost=3.0} +(1 row) + +-- drop the tablespace so we can re-use the location +DROP TABLESPACE regress_tblspacewith; +-- This returns a relative path as of an effect of allow_in_place_tablespaces, +-- masking the tablespace OID used in the path name. 
+SELECT regexp_replace(pg_tablespace_location(oid), '(pg_tblspc)/(\d+)', '\1/NNN') + FROM pg_tablespace WHERE spcname = 'regress_tblspace'; + regexp_replace +---------------- + pg_tblspc/NNN +(1 row) + +-- try setting and resetting some properties for the new tablespace +ALTER TABLESPACE regress_tblspace SET (random_page_cost = 1.0, seq_page_cost = 1.1); +ALTER TABLESPACE regress_tblspace SET (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +ALTER TABLESPACE regress_tblspace RESET (random_page_cost = 2.0); -- fail +ERROR: RESET must not include values for parameters +ALTER TABLESPACE regress_tblspace RESET (random_page_cost, effective_io_concurrency); -- ok +-- REINDEX (TABLESPACE) +-- catalogs and system tablespaces +-- system catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_am; +ERROR: cannot move system relation "pg_am_name_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_am; +ERROR: cannot reindex system catalogs concurrently +-- shared catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- toast relations, fail +REINDEX (TABLESPACE regress_tblspace) INDEX pg_toast.pg_toast_1260_index; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) INDEX CONCURRENTLY pg_toast.pg_toast_1260_index; +ERROR: cannot reindex system catalogs concurrently +REINDEX (TABLESPACE regress_tblspace) TABLE pg_toast.pg_toast_1260; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_toast.pg_toast_1260; +ERROR: cannot reindex system catalogs concurrently +-- system catalog, fail +REINDEX (TABLESPACE pg_global) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" 
+REINDEX (TABLESPACE pg_global) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- table with toast relation +CREATE TABLE regress_tblspace_test_tbl (num1 bigint, num2 double precision, t text); +INSERT INTO regress_tblspace_test_tbl (num1, num2, t) + SELECT round(random()*100), random(), 'text' + FROM generate_series(1, 10) s(i); +CREATE INDEX regress_tblspace_test_tbl_idx ON regress_tblspace_test_tbl (num1); +-- move to global tablespace, fail +REINDEX (TABLESPACE pg_global) INDEX regress_tblspace_test_tbl_idx; +ERROR: only shared relations can be placed in pg_global tablespace +REINDEX (TABLESPACE pg_global) INDEX CONCURRENTLY regress_tblspace_test_tbl_idx; +ERROR: cannot move non-shared relation to tablespace "pg_global" +-- check transactional behavior of REINDEX (TABLESPACE) +BEGIN; +REINDEX (TABLESPACE regress_tblspace) INDEX regress_tblspace_test_tbl_idx; +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +ROLLBACK; +-- no relation moved to the new tablespace +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace'; + relname +--------- +(0 rows) + +-- check that all indexes are moved to a new tablespace with different +-- relfilenode. +-- Save first the existing relfilenode for the toast and main relations. 
+SELECT relfilenode as main_filenode FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx' \gset +SELECT relfilenode as toast_filenode FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl') \gset +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE regress_tblspace; +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +-- Move back to the default tablespace. 
+ALTER INDEX regress_tblspace_test_tbl_idx SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +--------- +(0 rows) + +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +SELECT relfilenode = :main_filenode AS main_same FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx'; + main_same +----------- + f +(1 row) + +SELECT relfilenode = :toast_filenode as toast_same FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl'); + toast_same +------------ + f +(1 row) + +DROP TABLE regress_tblspace_test_tbl; +-- REINDEX (TABLESPACE) with partitions +-- Create a partition tree and check the set of relations reindexed +-- with their new tablespace. +CREATE TABLE tbspace_reindex_part (c1 int, c2 int) PARTITION BY RANGE (c1); +CREATE TABLE tbspace_reindex_part_0 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (0) TO (10) PARTITION BY list (c2); +CREATE TABLE tbspace_reindex_part_0_1 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (1); +CREATE TABLE tbspace_reindex_part_0_2 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (2); +-- This partitioned table will have no partitions. 
+CREATE TABLE tbspace_reindex_part_10 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (10) TO (20) PARTITION BY list (c2); +-- Create some partitioned indexes +CREATE INDEX tbspace_reindex_part_index ON ONLY tbspace_reindex_part (c1); +CREATE INDEX tbspace_reindex_part_index_0 ON ONLY tbspace_reindex_part_0 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_0; +-- This partitioned index will have no partitions. +CREATE INDEX tbspace_reindex_part_index_10 ON ONLY tbspace_reindex_part_10 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_10; +CREATE INDEX tbspace_reindex_part_index_0_1 ON ONLY tbspace_reindex_part_0_1 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_1; +CREATE INDEX tbspace_reindex_part_index_0_2 ON ONLY tbspace_reindex_part_0_2 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_2; +SELECT relid, parentrelid, level FROM pg_partition_tree('tbspace_reindex_part_index') + ORDER BY relid, level; + relid | parentrelid | level +--------------------------------+------------------------------+------- + tbspace_reindex_part_index | | 0 + tbspace_reindex_part_index_0 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_10 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_0_1 | tbspace_reindex_part_index_0 | 2 + tbspace_reindex_part_index_0_2 | tbspace_reindex_part_index_0 | 2 +(5 rows) + +-- Track the original tablespace, relfilenode and OID of each index +-- in the tree. +CREATE TEMP TABLE reindex_temp_before AS + SELECT oid, relname, relfilenode, reltablespace + FROM pg_class + WHERE relname ~ 'tbspace_reindex_part_index'; +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tbspace_reindex_part; +-- REINDEX CONCURRENTLY changes the OID of the old relation, hence a check +-- based on the relation name below. 
+SELECT b.relname, + CASE WHEN a.relfilenode = b.relfilenode THEN 'relfilenode is unchanged' + ELSE 'relfilenode has changed' END AS filenode, + CASE WHEN a.reltablespace = b.reltablespace THEN 'reltablespace is unchanged' + ELSE 'reltablespace has changed' END AS tbspace + FROM reindex_temp_before b JOIN pg_class a ON b.relname = a.relname + ORDER BY 1; + relname | filenode | tbspace +--------------------------------+--------------------------+---------------------------- + tbspace_reindex_part_index | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0 | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0_1 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_0_2 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_10 | relfilenode is unchanged | reltablespace is unchanged +(5 rows) + +DROP TABLE tbspace_reindex_part; +-- create a schema we can use +CREATE SCHEMA testschema; +-- try a table +CREATE TABLE testschema.foo (i int) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo'; + relname | spcname +---------+------------------ + foo | regress_tblspace +(1 row) + +INSERT INTO testschema.foo VALUES(1); +INSERT INTO testschema.foo VALUES(2); +-- tables from dynamic sources +CREATE TABLE testschema.asselect TABLESPACE regress_tblspace AS SELECT 1; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asselect'; + relname | spcname +----------+------------------ + asselect | regress_tblspace +(1 row) + +PREPARE selectsource(int) AS SELECT $1; +CREATE TABLE testschema.asexecute TABLESPACE regress_tblspace + AS EXECUTE selectsource(2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 
'asexecute'; + relname | spcname +-----------+------------------ + asexecute | regress_tblspace +(1 row) + +-- index +CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo_idx'; + relname | spcname +---------+------------------ + foo_idx | regress_tblspace +(1 row) + +-- check \d output +\d testschema.foo + Table "testschema.foo" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + i | integer | | | +Indexes: + "foo_idx" btree (i), tablespace "regress_tblspace" +Tablespace: "regress_tblspace" + +\d testschema.foo_idx + Index "testschema.foo_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + i | integer | yes | i +btree, for table "testschema.foo" +Tablespace: "regress_tblspace" + +-- +-- partitioned table +-- +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +ERROR: only shared relations can be placed in pg_global tablespace +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ERROR: only shared relations can be placed in pg_global tablespace +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part 
SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +ERROR: only shared relations can be placed in pg_global tablespace +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; + relname | spcname +----------+------------------ + part | + part_1 | + part_2 | regress_tblspace + part_3 | regress_tblspace + part_4 | + part_56 | regress_tblspace + part_78 | + part_910 | regress_tblspace +(8 rows) + +RESET default_tablespace; +DROP TABLE testschema.part; +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx' ORDER BY relname; + relname | spcname +-------------+------------------ + part1_a_idx | regress_tblspace + part2_a_idx | regress_tblspace + part_a_idx | regress_tblspace +(3 rows) + +\d testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Number of partitions: 2 (Use \d+ to list them.) 
+ +\d+ testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Partitions: testschema.part1 FOR VALUES IN (1), + testschema.part2 FOR VALUES IN (2) + +\d testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: testschema.part FOR VALUES IN (1) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d+ testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: testschema.part FOR VALUES IN (1) +Partition constraint: ((a IS NOT NULL) AND (a = 1)) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d testschema.part_a_idx +Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.part" +Number of partitions: 2 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d+ testschema.part_a_idx + Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+---------+-------------- + a | integer | yes | a | plain | +btree, for table "testschema.part" +Partitions: testschema.part1_a_idx, + testschema.part2_a_idx +Tablespace: "regress_tblspace" + +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_default_tab VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab (id); +CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) 
+Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) 
+Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab_p; +-- check that default_tablespace affects index additions in ALTER TABLE +CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_tab VALUES (1); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (id); +SET default_tablespace TO ''; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_pkey PRIMARY KEY (id); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_pkey + Index "testschema.test_tab_pkey" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_tab" + +SELECT * FROM testschema.test_tab; + id +---- + 1 +(1 row) + +DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + b | integer | yes | b +btree, for table "testschema.test_tab" + +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+--------+------+------------ + b | bigint | yes | b +btree, for table "testschema.test_tab" + +DROP TABLE testschema.test_tab; +-- let's try moving a table from one place to another +CREATE TABLE testschema.atable AS VALUES (1), (2); +CREATE UNIQUE INDEX anindex ON testschema.atable(column1); +ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; +ERROR: only shared relations can be placed in pg_global tablespace +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; +INSERT INTO testschema.atable VALUES(3); -- ok +INSERT INTO testschema.atable VALUES(1); -- fail (checks index) +ERROR: duplicate key value violates unique constraint "anindex" +DETAIL: Key (column1)=(1) already exists. 
+SELECT COUNT(*) FROM testschema.atable; -- checks heap + count +------- + 3 +(1 row) + +-- let's try moving a materialized view from one place to another +CREATE MATERIALIZED VIEW testschema.amv AS SELECT * FROM testschema.atable; +ALTER MATERIALIZED VIEW testschema.amv SET TABLESPACE regress_tblspace; +REFRESH MATERIALIZED VIEW testschema.amv; +SELECT COUNT(*) FROM testschema.amv; + count +------- + 3 +(1 row) + +-- Will fail with bad path +CREATE TABLESPACE regress_badspace LOCATION '/no/such/location'; +ERROR: directory "/no/such/location" does not exist +-- No such tablespace +CREATE TABLE bar (i int) TABLESPACE regress_nosuchspace; +ERROR: tablespace "regress_nosuchspace" does not exist +-- Fail, in use for some partitioned object +DROP TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" cannot be dropped because some objects depend on it +DETAIL: tablespace for index testschema.part_a_idx +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +-- Fail, not empty +DROP TABLESPACE regress_tblspace; +CREATE ROLE regress_tablespace_user1 login; +CREATE ROLE regress_tablespace_user2 login; +GRANT USAGE ON SCHEMA testschema TO regress_tablespace_user2; +ALTER TABLESPACE regress_tblspace OWNER TO regress_tablespace_user1; +ERROR: tablespace "regress_tblspace" does not exist +CREATE TABLE testschema.tablespace_acl (c int); +-- new owner lacks permission to create this index from scratch +CREATE INDEX k ON testschema.tablespace_acl (c) TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl OWNER TO regress_tablespace_user2; +SET SESSION ROLE regress_tablespace_user2; +CREATE TABLE tablespace_table (i int) TABLESPACE regress_tblspace; -- fail +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl ALTER c TYPE bigint; +REINDEX (TABLESPACE regress_tblspace) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist 
+REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +RESET ROLE; +ALTER TABLESPACE regress_tblspace RENAME TO regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER INDEX ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER MATERIALIZED VIEW ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should show notice that nothing was done +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER MATERIALIZED VIEW ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should succeed +DROP TABLESPACE regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace_renamed" does not exist +DROP SCHEMA testschema CASCADE; +NOTICE: drop cascades to 7 other objects +DETAIL: drop cascades to table testschema.foo +drop cascades to table testschema.asselect +drop cascades to table testschema.asexecute +drop cascades to table testschema.part +drop cascades to table testschema.atable +drop cascades to materialized view testschema.amv +drop cascades to table testschema.tablespace_acl +DROP ROLE regress_tablespace_user1; +DROP ROLE regress_tablespace_user2; From 3cc313c19d5cbc60f2b1ded5a1cf59ffca3005d1 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 8 Jul 2025 15:38:01 +0200 Subject: [PATCH 36/50] NEON: Add Neon's 2 new boot modes to the Postgres binary Specifically, --sync-safekeepers and --wal-redo --- src/backend/main/main.c | 43 +++++++++++++++++++++++++++++ 
src/include/postmaster/postmaster.h | 2 ++ 2 files changed, 45 insertions(+) diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 7d63cf94a6b..edeffa6804c 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -52,6 +52,8 @@ static const char *const DispatchOptionNames[] = [DISPATCH_FORKCHILD] = "forkchild", [DISPATCH_DESCRIBE_CONFIG] = "describe-config", [DISPATCH_SINGLE] = "single", + [DISPATCH_NEON_WALREDO] = "wal-redo", + [DISPATCH_NEON_SYNCWAL] = "sync-safekeepers", /* DISPATCH_POSTMASTER has no name */ }; @@ -63,6 +65,41 @@ static void init_locale(const char *categoryname, int category, const char *loca static void help(const char *progname); static void check_root(const char *progname); +typedef int (*MainFunc) (int argc, char *argv[]); + +static int +CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[], bool load_config) +{ + MainFunc main_func; + + /* + * Perform just enough initialization that we can load external libraries + */ + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (load_config && !SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * neon extension sets PGC_POSTMASTER gucs requiring this. + */ + process_shared_preload_libraries_in_progress = true; + + main_func = load_external_function(library_name, main_func_name, true, NULL); + + process_shared_preload_libraries_in_progress = false; + + return main_func(argc, argv); +} /* * Any Postgres server process begins execution here. 
@@ -223,6 +260,12 @@ main(int argc, char *argv[]) PostgresSingleUserMain(argc, argv, strdup(get_user_name_or_exit(progname))); break; + case DISPATCH_NEON_WALREDO: + CallExtMain("neon_walredo", "WalRedoMain", argc, argv, false); + break; + case DISPATCH_NEON_SYNCWAL: + CallExtMain("neon", "WalProposerSync", argc, argv, true); + break; case DISPATCH_POSTMASTER: PostmasterMain(argc, argv); break; diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 92497cd6a0f..01ef3418551 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -137,6 +137,8 @@ typedef enum DispatchOption DISPATCH_FORKCHILD, DISPATCH_DESCRIBE_CONFIG, DISPATCH_SINGLE, + DISPATCH_NEON_WALREDO, + DISPATCH_NEON_SYNCWAL, DISPATCH_POSTMASTER, /* must be last */ } DispatchOption; From d1ed4ad76e3f97876a96d2420d6f9244ba1aa16c Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 8 Jul 2025 15:40:54 +0200 Subject: [PATCH 37/50] NEON: Allow background workers to be handled as WAL Sender process This makes neon's WalProposer work. --- src/backend/postmaster/postmaster.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index e01d9f0cfe8..bb799eb9def 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -3500,6 +3500,18 @@ SignalChildren(int signal, BackendTypeMask targetMask) bp->bkend_type = B_WAL_SENDER; } + /* + * If we need to distinguish between B_BG_WORKER and B_WAL_SENDER, + * check if any B_BG_WORKER backends have recently announced that + * they are actually WAL senders. 
+ */ + if (btmask_contains(targetMask, B_WAL_SENDER) != btmask_contains(targetMask, B_BG_WORKER) && + bp->bkend_type == B_BG_WORKER) + { + if (IsPostmasterChildWalSender(bp->child_slot)) + bp->bkend_type = B_WAL_SENDER; + } + if (!btmask_contains(targetMask, bp->bkend_type)) continue; @@ -3934,6 +3946,18 @@ CountChildren(BackendTypeMask targetMask) bp->bkend_type = B_WAL_SENDER; } + /* + * If we need to distinguish between B_BG_WORKER and B_WAL_SENDER, + * check if any B_BG_WORKER backends have recently announced that + * they are actually WAL senders. + */ + if (btmask_contains(targetMask, B_WAL_SENDER) != btmask_contains(targetMask, B_BG_WORKER) && + bp->bkend_type == B_BG_WORKER) + { + if (IsPostmasterChildWalSender(bp->child_slot)) + bp->bkend_type = B_WAL_SENDER; + } + if (!btmask_contains(targetMask, bp->bkend_type)) continue; From 74c1079d5abaab9dccf1a9a01f60e9918a06ff28 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 8 Jul 2025 15:44:26 +0200 Subject: [PATCH 38/50] NEON: Mark as custom "Neon Postgres" ABI Used to ensure extensions aren't compatible-by-default between Neon and vanilla PostgreSQL - there are some important differences. --- src/include/pg_config_manual.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 125d3eb5fff..54fb3175cb0 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -57,7 +57,7 @@ * version. Example: "ACME Postgres/1.2". Note that the string will appear * in a user-facing error message if an ABI mismatch is detected. */ -#define FMGR_ABI_EXTRA "PostgreSQL" +#define FMGR_ABI_EXTRA "Neon Postgres" /* * Maximum number of columns in an index. 
There is little point in making From 494e00e7fe428d7ad7744f359eeeb77f7b5ec13d Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 8 Jul 2025 15:46:44 +0200 Subject: [PATCH 39/50] NEON: Add --with-libseccomp configure option Not really used in PostgreSQL, but used for wal-redo in Pageserver. --- configure | 87 ++++++++++++++++++++++++++++++++++++++ configure.ac | 13 ++++++ src/Makefile.global.in | 1 + src/include/pg_config.h.in | 3 ++ 4 files changed, 104 insertions(+) diff --git a/configure b/configure index ed5f9cafb4f..91c3f305930 100755 --- a/configure +++ b/configure @@ -723,6 +723,7 @@ LIBURING_CFLAGS with_liburing with_readline with_systemd +with_libseccomp with_selinux with_ldap with_krb_srvnam @@ -871,6 +872,7 @@ with_bsd_auth with_ldap with_bonjour with_selinux +with_libseccomp with_systemd with_readline with_libedit_preferred @@ -1589,6 +1591,7 @@ Optional Packages: --with-ldap build with LDAP support --with-bonjour build with Bonjour support --with-selinux build with SELinux support + --with-libseccomp build with libseccomp support --with-systemd build with systemd support --without-readline do not use GNU Readline nor BSD Libedit for editing --with-libedit-preferred @@ -8614,6 +8617,39 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_selinux" >&5 $as_echo "$with_selinux" >&6; } +# +# libseccomp +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with libseccomp support" >&5 +$as_echo_n "checking whether to build with libseccomp support... " >&6; } + + + +# Check whether --with-libseccomp was given. +if test "${with_libseccomp+set}" = set; then : + withval=$with_libseccomp; + case $withval in + yes) + : + ;; + no) + : + ;; + *) + as_fn_error $? 
"no argument expected for --with-libseccomp option" "$LINENO" 5 + ;; + esac + +else + with_libseccomp=no + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libseccomp" >&5 +$as_echo "$with_libseccomp" >&6; } + # # Systemd # @@ -13661,6 +13697,57 @@ else as_fn_error $? "library 'libselinux', version 2.1.10 or newer, is required for SELinux support" "$LINENO" 5 fi + +fi + +if test "$with_libseccomp" = yes ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for seccomp_init in -lseccomp" >&5 +$as_echo_n "checking for seccomp_init in -lseccomp... " >&6; } +if ${ac_cv_lib_seccomp_seccomp_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lseccomp $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char seccomp_init (); +int +main () +{ +return seccomp_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_seccomp_seccomp_init=yes +else + ac_cv_lib_seccomp_seccomp_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_seccomp_seccomp_init" >&5 +$as_echo "$ac_cv_lib_seccomp_seccomp_init" >&6; } +if test "x$ac_cv_lib_seccomp_seccomp_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBSECCOMP 1 +_ACEOF + + LIBS="-lseccomp $LIBS" + +else + as_fn_error $? 
"library 'libseccomp' is required for Seccomp BPF support" "$LINENO" 5 +fi + fi # for contrib/uuid-ossp diff --git a/configure.ac b/configure.ac index 587772d2b93..391e6749cf9 100644 --- a/configure.ac +++ b/configure.ac @@ -943,6 +943,14 @@ PGAC_ARG_BOOL(with, selinux, no, [build with SELinux support]) AC_SUBST(with_selinux) AC_MSG_RESULT([$with_selinux]) +# +# libseccomp +# +AC_MSG_CHECKING([whether to build with libseccomp support]) +PGAC_ARG_BOOL(with, libseccomp, no, [build with libseccomp support]) +AC_SUBST(with_libseccomp) +AC_MSG_RESULT([$with_libseccomp]) + # # Systemd # @@ -1638,6 +1646,11 @@ dnl If you want to use Apple's own Bonjour code on another platform, dnl just add -ldns_sd to LIBS manually. fi +if test "$with_libseccomp" = yes ; then + AC_CHECK_LIB(seccomp, seccomp_init, [], + [AC_MSG_ERROR([library 'libseccomp' is required for Seccomp BPF support])]) +fi + # for contrib/uuid-ossp if test "$with_uuid" = bsd ; then AC_CHECK_HEADERS(uuid.h, diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8b1b357beaa..790d9cf058e 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -191,6 +191,7 @@ with_tcl = @with_tcl@ with_ssl = @with_ssl@ with_readline = @with_readline@ with_selinux = @with_selinux@ +with_libseccomp = @with_libseccomp@ with_systemd = @with_systemd@ with_gssapi = @with_gssapi@ with_krb_srvnam = @with_krb_srvnam@ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c4dc5d72bdb..ba0d6f12a54 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -265,6 +265,9 @@ /* Define if you have a function readline library */ #undef HAVE_LIBREADLINE +/* Define to 1 if you have the `seccomp' library (-lseccomp). */ +#undef HAVE_LIBSECCOMP + /* Define to 1 if you have the `selinux' library (-lselinux). 
*/ #undef HAVE_LIBSELINUX From de2312371dacf89ae4c8522f75b017a9bcd50057 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 8 Jul 2025 15:49:43 +0200 Subject: [PATCH 40/50] NEON: Expose WalSndWaitForWal for use in Neon's walproposer --- src/backend/replication/walsender.c | 5 ++--- src/include/replication/walsender.h | 3 +++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 7c93d92d819..d5a21e34597 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -275,7 +275,6 @@ static void WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, Tran static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write); static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool skipped_xact); -static XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch); static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, @@ -3533,8 +3532,8 @@ WalSndDone(WalSndSendDataCallback send_data) * flush location if valid, write otherwise. Tools like pg_receivewal will * usually (unless in synchronous mode) return an invalid flush location. */ - replicatedPtr = XLogRecPtrIsInvalid(MyWalSnd->flush) ? 
- MyWalSnd->write : MyWalSnd->flush; + /* NEON: Neon uses flush_lsn to pass extra payload, so use write_lsn here */ + replicatedPtr = MyWalSnd->write; if (WalSndCaughtUp && sentPtr == replicatedPtr && !pq_is_send_pending()) diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 8b475261a6d..2ca1b36d780 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -71,6 +71,9 @@ extern void ProcessStandbyHSFeedback(TimestampTz replyTime, TransactionId feedbackCatalogXmin, uint32 feedbackCatalogEpoch); +/* exposed for WALSender */ +extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); + /* * Remember that we want to wakeup walsenders later * From 0eecd497f09271e6810c4fc63dcb365e3fde29a9 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 19 Sep 2024 09:34:35 +0300 Subject: [PATCH 41/50] NEON: Allow HOT replica to continue replay after GUC mismatches Instead we'll crash and restart when this happens, allowing a generally better experience for normal users which don't get so close to the limits that they would cause issues. 
--- src/backend/access/transam/xlogrecovery.c | 19 ++++++++++++++++++- src/include/access/xlogrecovery.h | 1 + 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index c34d0a6dfc6..376317597e3 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -92,6 +92,7 @@ TimestampTz recoveryTargetTime; const char *recoveryTargetName; XLogRecPtr recoveryTargetLSN; int recovery_min_apply_delay = 0; +bool allowReplicaMisconfig = true; /* NEON: GUC is defined in neon extension */ /* options formerly taken from recovery.conf for XLOG streaming */ char *PrimaryConnInfo = NULL; @@ -4868,6 +4869,12 @@ RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue currValue, minValue))); + if (allowReplicaMisconfig) + { + /* Сontinue replication even though it can cause problems later */ + return; + } + SetRecoveryPause(true); ereport(LOG, @@ -4916,7 +4923,17 @@ RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue ConditionVariableCancelSleep(); } - ereport(FATAL, + if (allowReplicaMisconfig) + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("Insufficient parameter settings can cause problems during recovery"), + /* Repeat the detail from above so it's easy to find in the log. */ + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue))); + else + ereport(FATAL, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("recovery aborted because of insufficient parameter settings"), /* Repeat the detail from above so it's easy to find in the log. 
*/ diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 6edef021f74..526dab87eb1 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -57,6 +57,7 @@ extern PGDLLIMPORT char *PrimarySlotName; extern PGDLLIMPORT char *recoveryRestoreCommand; extern PGDLLIMPORT char *recoveryEndCommand; extern PGDLLIMPORT char *archiveCleanupCommand; +extern PGDLLIMPORT bool allowReplicaMisconfig; /* indirectly set via GUC system */ extern PGDLLIMPORT TransactionId recoveryTargetXid; From 211af09c64c6a98e8bc27e2e0bea254518dbe1da Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 16 Jul 2025 14:47:31 +0200 Subject: [PATCH 42/50] NEON: Add a databricks auth hook Used by Databricks' authentication system to authenticate their users. --- src/backend/libpq/auth.c | 52 +++++++++++++++++++++++++++++++++++----- src/include/libpq/auth.h | 10 ++++++++ 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index 4da46666439..b4ac7f3e3a6 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -222,6 +222,11 @@ static int PerformRadiusTransaction(const char *server, const char *secret, cons */ ClientAuthentication_hook_type ClientAuthentication_hook = NULL; +/* + * This hook calls back Databricks CP for authentication + */ +DatabricksAuthentication_hook_type DatabricksAuthentication_hook = NULL; + /* * Tell the user the authentication failed, but not (much about) why. * @@ -791,23 +796,58 @@ CheckPasswordAuth(Port *port, const char **logdetail) int result; char *shadow_pass; + /* BEGIN HADRON */ + + /* + * this flag is passed to databricks auth hook and is updated by the hook + * to false if we should continue with password auth. This is by default + * true so that we don't accidentally do password auth if there is some + * bug in the hook. It's better to rely on the hook to set it explicitly + * false to continue with password auth. 
+ */ + bool skip_password_auth = true; + + /* END HADRON */ + sendAuthRequest(port, AUTH_REQ_PASSWORD, NULL, 0); passwd = recv_password_packet(port); if (passwd == NULL) return STATUS_EOF; /* client wouldn't send password */ - shadow_pass = get_role_password(port->user_name, logdetail); - if (shadow_pass) + /* BEGIN HADRON */ + elog(DEBUG1, "Databricks: before authentication hook"); + + if (DatabricksAuthentication_hook) { - result = plain_crypt_verify(port->user_name, shadow_pass, passwd, - logdetail); + result = (*DatabricksAuthentication_hook) (port, passwd, &skip_password_auth, logdetail); } else + { + /* If hook is not set, do the password auth by default */ + skip_password_auth = false; result = STATUS_ERROR; + } - if (shadow_pass) - pfree(shadow_pass); + elog(DEBUG1, "Databricks: after authentication hook"); + + /* only try PG password auth if the hook didn't return STATUS_OK and */ + /* the hook set the skip_password_auth flag to false */ + if (result != STATUS_OK && !skip_password_auth) + { + shadow_pass = get_role_password(port->user_name, logdetail); + if (shadow_pass) + { + result = plain_crypt_verify(port->user_name, shadow_pass, passwd, + logdetail); + } + else + result = STATUS_ERROR; + + if (shadow_pass) + pfree(shadow_pass); + } + /* END HADRON */ pfree(passwd); if (result == STATUS_OK) diff --git a/src/include/libpq/auth.h b/src/include/libpq/auth.h index cc9643cce2f..0b154a09bfc 100644 --- a/src/include/libpq/auth.h +++ b/src/include/libpq/auth.h @@ -51,4 +51,14 @@ typedef char *(*auth_password_hook_typ) (char *input); /* Default LDAP password mutator hook, can be overridden by a shared library */ extern PGDLLIMPORT auth_password_hook_typ ldap_password_hook; +/* Hook for databricks authentication + * returns STATUS_OK on success, STATUS_ERROR on failure + * skip_passwd_auth is set to true/false if password authentication should be tried or not on STATUS_ERROR + * */ +typedef int (*DatabricksAuthentication_hook_type) (Port *port, + const char 
*passwd, + bool *skip_passwd_auth, + const char **logdetail); +extern PGDLLIMPORT DatabricksAuthentication_hook_type DatabricksAuthentication_hook; + #endif /* AUTH_H */ From 82f85f013e916d8c55c92adb6ed051606220b2a5 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 24 Jul 2025 00:00:28 +0200 Subject: [PATCH 43/50] NEON: Stash work to make PG18 work well enough Changes include: - Removing smgr_[start/finish/end]_unlogged_build from f_smgr, making it a set of hooks. - Renaming smgr_read_slru_segment, and making it a hook The smgrrelation was never used, so the API didn't make much sense. - Auto-adjust smgr relation persistence based on buffer pin levels. This allows the smgr->which method to work better than it did before. - Adjust some places' passing of relpersistence, from rel->rd_rel->relpersistence, to smgr->relpersistence. This enables Neon's hot-switching of relpersistence - Add relpersistence to PendingRelDelete Also allow this setting to be changed on-the-fly; again, enabling Neon to do local buffered index builds. - Change buffers that are getting logged to BM_PERMANENT Again, locally buffered index builds - "Fix" various HeapAM redo methods We can't really read buffers yet in PG18 in wal-redo postgres, so we try to avoid that here. TODO: Find a proper solution to the WAL-redo PostgreSQL stuff - we're currently really screwed with the AIO system; it adds a lot of complexity in a system we'd prefer to keep as simple as possible. Maybe now is the time to finally improve our walredo process to stop relying on main PG's buffer infra and link in our own bufmgr.c code? 
--- src/backend/access/hash/hash_xlog.c | 7 +- src/backend/access/heap/heapam_xlog.c | 28 ++--- src/backend/access/transam/slru.c | 2 +- src/backend/access/transam/xloginsert.c | 42 ++++++++ src/backend/access/transam/xlogreader.c | 35 ++++++- src/backend/access/transam/xlogutils.c | 3 + src/backend/catalog/storage.c | 27 ++++- src/backend/storage/buffer/bufmgr.c | 50 ++++++--- src/backend/storage/ipc/ipci.c | 3 + src/backend/storage/smgr/smgr.c | 102 ++++++++++++++----- src/backend/utils/activity/pgstat_database.c | 3 + src/include/catalog/storage.h | 4 + src/include/miscadmin.h | 2 +- src/include/storage/smgr.h | 21 ++-- src/test/regress/expected/tablespace_1.out | 18 ++-- 15 files changed, 274 insertions(+), 73 deletions(-) diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c index 8d97067fe54..76434b4a0e2 100644 --- a/src/backend/access/hash/hash_xlog.c +++ b/src/backend/access/hash/hash_xlog.c @@ -18,6 +18,7 @@ #include "access/hash.h" #include "access/hash_xlog.h" #include "access/xlogutils.h" +#include "miscadmin.h" #include "storage/standby.h" /* @@ -49,7 +50,7 @@ hash_xlog_init_meta_page(XLogReaderState *record) * full page image of the metapage. */ XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) + if (forknum == INIT_FORKNUM && !am_wal_redo_postgres) FlushOneBuffer(metabuf); /* all done */ @@ -87,7 +88,7 @@ hash_xlog_init_bitmap_page(XLogReaderState *record) * full page image of the metapage. 
*/ XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) + if (forknum == INIT_FORKNUM && !am_wal_redo_postgres) FlushOneBuffer(bitmapbuf); UnlockReleaseBuffer(bitmapbuf); @@ -111,7 +112,7 @@ hash_xlog_init_bitmap_page(XLogReaderState *record) MarkBufferDirty(metabuf); XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) + if (forknum == INIT_FORKNUM && !am_wal_redo_postgres) FlushOneBuffer(metabuf); } if (BufferIsValid(metabuf)) diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c index a201efceb5e..98b9b8587f8 100644 --- a/src/backend/access/heap/heapam_xlog.c +++ b/src/backend/access/heap/heapam_xlog.c @@ -19,6 +19,7 @@ #include "access/visibilitymap.h" #include "access/xlog.h" #include "access/xlogutils.h" +#include "miscadmin.h" #include "storage/freespace.h" #include "storage/standby.h" @@ -157,7 +158,7 @@ heap_xlog_prune_freeze(XLogReaderState *record) { if (xlrec.flags & (XLHP_HAS_REDIRECTIONS | XLHP_HAS_DEAD_ITEMS | - XLHP_HAS_NOW_UNUSED_ITEMS)) + XLHP_HAS_NOW_UNUSED_ITEMS) && !am_wal_redo_postgres) { Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); @@ -273,7 +274,7 @@ heap_xlog_visible(XLogReaderState *record) * Do this regardless of a full-page image being applied, since the * FSM data is not in the page anyway. */ - if (xlrec->flags & VISIBILITYMAP_VALID_BITS) + if (xlrec->flags & VISIBILITYMAP_VALID_BITS && !am_wal_redo_postgres) XLogRecordPageWithFreeSpace(rlocator, blkno, space); } @@ -369,7 +370,7 @@ heap_xlog_delete(XLogReaderState *record) * The visibility map may need to be fixed even if the heap page is * already up-to-date. 
*/ - if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED && !am_wal_redo_postgres) { Relation reln = CreateFakeRelcacheEntry(target_locator); Buffer vmbuffer = InvalidBuffer; @@ -456,7 +457,7 @@ heap_xlog_insert(XLogReaderState *record) * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ - if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED && !am_wal_redo_postgres) { Relation reln = CreateFakeRelcacheEntry(target_locator); Buffer vmbuffer = InvalidBuffer; @@ -536,7 +537,8 @@ heap_xlog_insert(XLogReaderState *record) * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. */ - if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5 + && !am_wal_redo_postgres) XLogRecordPageWithFreeSpace(target_locator, blkno, freespace); } @@ -580,7 +582,7 @@ heap_xlog_multi_insert(XLogReaderState *record) * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ - if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED && !am_wal_redo_postgres) { Relation reln = CreateFakeRelcacheEntry(rlocator); Buffer vmbuffer = InvalidBuffer; @@ -683,7 +685,8 @@ heap_xlog_multi_insert(XLogReaderState *record) * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. */ - if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5 + && !am_wal_redo_postgres) XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); } @@ -739,7 +742,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) * The visibility map may need to be fixed even if the heap page is * already up-to-date. 
*/ - if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED && !am_wal_redo_postgres) { Relation reln = CreateFakeRelcacheEntry(rlocator); Buffer vmbuffer = InvalidBuffer; @@ -823,7 +826,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ - if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED && !am_wal_redo_postgres) { Relation reln = CreateFakeRelcacheEntry(rlocator); Buffer vmbuffer = InvalidBuffer; @@ -958,7 +961,8 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. */ - if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) + if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5 + && !am_wal_redo_postgres) XLogRecordPageWithFreeSpace(rlocator, newblk, freespace); } @@ -1019,7 +1023,7 @@ heap_xlog_lock(XLogReaderState *record) * The visibility map may need to be fixed even if the heap page is * already up-to-date. */ - if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) + if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED && !am_wal_redo_postgres) { RelFileLocator rlocator; Buffer vmbuffer = InvalidBuffer; @@ -1095,7 +1099,7 @@ heap_xlog_lock_updated(XLogReaderState *record) * The visibility map may need to be fixed even if the heap page is * already up-to-date. 
*/ - if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) + if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED && !am_wal_redo_postgres) { RelFileLocator rlocator; Buffer vmbuffer = InvalidBuffer; diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 2cc3b962ee5..8dae1c48df9 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -745,7 +745,7 @@ SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) segno = pageno / SLRU_PAGES_PER_SEGMENT; buffer = palloc(BLCKSZ * SLRU_PAGES_PER_SEGMENT); - n_blocks = smgr_read_slru_segment(path, segno, buffer); + n_blocks = read_slru_segment(path, segno, buffer); if (n_blocks > 0) { fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 36a1e3d7e7e..ecc16ae79fd 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -35,6 +35,7 @@ #include "common/pg_lzcompress.h" #include "miscadmin.h" #include "pg_trace.h" +#include "storage/buf_internals.h" #include "replication/origin.h" #include "replication/walsender.h" #include "storage/bufmgr.h" @@ -1337,7 +1338,23 @@ log_newpage_range(Relation rel, ForkNumber forknum, if (!PageIsNew(BufferGetPage(buf))) bufpack[nbufs++] = buf; else + { + BufferDesc *bufdesc; + + bufdesc = GetBufferDescriptor(buf); + + /* only operate on operations on non-permanent buffers */ + if ((pg_atomic_read_u32(&bufdesc->state) & BM_PERMANENT) == 0) + { + /* We have the exclusive content lock on the page, so no concurrent + * writer can be accessing this page */ + uint32 buf_state = LockBufHdr(bufdesc); + buf_state |= BM_PERMANENT; + UnlockBufHdr(bufdesc, buf_state); + } + UnlockReleaseBuffer(buf); + } blkno++; } @@ -1357,6 +1374,31 @@ log_newpage_range(Relation rel, ForkNumber forknum, recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + /* + * Mark the buffers as permanent, if they weren't already 
marked PERMANENT. + * + * Note that this does not race with concurrent writes, because those + * can not start while the page is exclusively locked by this backend. + */ + for (i = 0; i < nbufs; i++) + { + BufferDesc *bufdesc; + uint32 buf_state; + + bufdesc = GetBufferDescriptor(bufpack[i] - 1); + + /* skip operations on already-permanent buffers */ + if (pg_atomic_read_u32(&bufdesc->state) & BM_PERMANENT) + continue; + + /* we have the exclusive content lock on the page, so this is safe */ + buf_state = LockBufHdr(bufdesc); + buf_state |= BM_PERMANENT; + UnlockBufHdr(bufdesc, buf_state); + + Assert(pg_atomic_read_u32(&bufdesc->state) & BM_PERMANENT); + } + for (i = 0; i < nbufs; i++) { PageSetLSN(BufferGetPage(bufpack[i]), recptr); diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index cbe45044f72..ba148a795b1 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -80,6 +80,11 @@ report_invalid_record(XLogReaderState *state, const char *fmt,...) va_end(args); state->errormsg_deferred = true; + +#ifndef FRONTEND + ereport(PANIC, (errmsg_internal("%s", state->errormsg_buf), + errbacktrace())); +#endif } /* @@ -1364,10 +1369,11 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, /* hmm, first page of file doesn't have a long header? 
*/ report_invalid_record(state, - "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u", + "invalid info bits %04X in WAL segment %s, LSN %X/%X, page addr %X/%X, offset %u", hdr->xlp_info, fname, LSN_FORMAT_ARGS(recptr), + LSN_FORMAT_ARGS(hdr->xlp_pageaddr), offset); return false; } @@ -1578,10 +1584,26 @@ WALRead(XLogReaderState *state, instr_time io_start; #endif +#ifndef FRONTEND + elog(LOG, "WALRead xlog startptr=%X/%08X count=%d", + LSN_FORMAT_ARGS(startptr), (int) count); +#endif p = buf; recptr = startptr; nbytes = count; +#ifndef FRONTEND + if ((startptr & wal_segment_size - 1) < XLOG_BLCKSZ && (recptr & wal_segment_size - 1) > SizeOfXLogLongPHD) + { + XLogLongPageHeaderData lphdr; + int baseoff = startptr & wal_segment_size - 1; + + memcpy(&lphdr, &p[-baseoff], SizeOfXLogLongPHD); + Assert(lphdr.std.xlp_info & XLP_LONG_HEADER); + Assert(lphdr.std.xlp_info & XLP_FIRST_IS_CONTRECORD || lphdr.std.xlp_rem_len == 0); + } +#endif + while (nbytes > 0) { uint32 startoff; @@ -1655,6 +1677,17 @@ WALRead(XLogReaderState *state, p += readbytes; } +#ifndef FRONTEND + if ((recptr & wal_segment_size - 1) <= XLOG_BLCKSZ && (recptr & wal_segment_size - 1) > SizeOfXLogLongPHD) + { + XLogLongPageHeaderData lphdr; + int baseoff = recptr & wal_segment_size - 1; + + memcpy(&lphdr, &p[-baseoff], SizeOfXLogLongPHD); + Assert(lphdr.std.xlp_info & XLP_LONG_HEADER); + Assert(lphdr.std.xlp_info & XLP_FIRST_IS_CONTRECORD || lphdr.std.xlp_rem_len == 0); + } +#endif return true; } diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 552cb92f3e6..21d4fa4afcd 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -862,6 +862,9 @@ int read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *cur_page) { + elog(LOG, "Reading local xlog page=%X/%08X len=%d recptr=%X/%08X", + LSN_FORMAT_ARGS(targetPagePtr), reqLen, + 
LSN_FORMAT_ARGS(targetRecPtr)); return read_local_xlog_page_guts(state, targetPagePtr, reqLen, targetRecPtr, cur_page, true); } diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 516e99ae236..c3601ce131f 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -64,6 +64,7 @@ typedef struct PendingRelDelete RelFileLocator rlocator; /* relation that may need to be deleted */ ProcNumber procNumber; /* INVALID_PROC_NUMBER if not a temp rel */ bool atCommit; /* T=delete at commit; F=delete at abort */ + char relpersistence; /* relation's relpersistence, for smgr ops */ int nestLevel; /* xact nesting level of request */ struct PendingRelDelete *next; /* linked-list link */ } PendingRelDelete; @@ -166,6 +167,7 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, pending->rlocator = rlocator; pending->procNumber = procNumber; pending->atCommit = false; /* delete if abort */ + pending->relpersistence = relpersistence; pending->nestLevel = GetCurrentTransactionNestLevel(); pending->next = pendingDeletes; pendingDeletes = pending; @@ -202,6 +204,23 @@ log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum) set_lwlsn_relation_hook(lsn, *rlocator, forkNum); } +void +RelationAdjustPendingDelete(RelFileLocator rlocator, ProcNumber procNumber, + char relpersistence) +{ + PendingRelDelete *pending = pendingDeletes; + while (pending != NULL) + { + if (RelFileLocatorEquals(pending->rlocator, rlocator) && + pending->procNumber == procNumber) + { + pending->relpersistence = relpersistence; + } + + pending = pending->next; + } +} + /* * RelationDropStorage * Schedule unlinking of physical storage at transaction commit. 
@@ -217,6 +236,7 @@ RelationDropStorage(Relation rel) pending->rlocator = rel->rd_locator; pending->procNumber = rel->rd_backend; pending->atCommit = true; /* delete if commit */ + pending->relpersistence = rel->rd_rel->relpersistence; /* delete if commit */ pending->nestLevel = GetCurrentTransactionNestLevel(); pending->next = pendingDeletes; pendingDeletes = pending; @@ -704,7 +724,8 @@ smgrDoPendingDeletes(bool isCommit) { SMgrRelation srel; - srel = smgropen(pending->rlocator, pending->procNumber, 0); + srel = smgropen(pending->rlocator, pending->procNumber, + pending->relpersistence); /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) { @@ -784,7 +805,9 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) uint64 total_blocks = 0; SMgrRelation srel; - srel = smgropen(pendingsync->rlocator, INVALID_PROC_NUMBER, 0); + /* We only have pending syncs for permanent relations */ + srel = smgropen(pendingsync->rlocator, INVALID_PROC_NUMBER, + RELPERSISTENCE_PERMANENT); /* * We emit newpage WAL records for smaller relations. diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 04531c14c7a..680a5274582 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -826,7 +826,8 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, * Read the buffer, and update pgstat counters to reflect a cache hit or * miss. 
*/ - buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0, + buf = ReadBuffer_common(reln, RelationGetSmgr(reln), + RelationGetSmgr(reln)->smgr_relpersistence, forkNum, blockNum, mode, strategy); return buf; @@ -908,7 +909,7 @@ ExtendBufferedRelBy(BufferManagerRelation bmr, if (bmr.smgr == NULL) { bmr.smgr = RelationGetSmgr(bmr.rel); - bmr.relpersistence = bmr.rel->rd_rel->relpersistence; + bmr.relpersistence = bmr.smgr->smgr_relpersistence; } return ExtendBufferedRelCommon(bmr, fork, strategy, flags, @@ -944,7 +945,7 @@ ExtendBufferedRelTo(BufferManagerRelation bmr, if (bmr.smgr == NULL) { bmr.smgr = RelationGetSmgr(bmr.rel); - bmr.relpersistence = bmr.rel->rd_rel->relpersistence; + bmr.relpersistence = bmr.smgr->smgr_relpersistence; } /* @@ -1234,7 +1235,7 @@ ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, } if (rel) - persistence = rel->rd_rel->relpersistence; + persistence = smgr_persistence; else persistence = smgr_persistence; @@ -1922,6 +1923,14 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) { instr_time io_start; + if (operation->smgr->smgr_relpersistence != operation->persistence) + { + SMgrRelation reln = operation->smgr; + operation->smgr = smgropen(reln->smgr_rlocator.locator, + reln->smgr_rlocator.backend, + operation->persistence); + } + /* We found a buffer that we need to read in. 
*/ Assert(io_buffers[0] == buffers[nblocks_done]); io_pages[0] = BufferGetBlock(buffers[nblocks_done]); @@ -4353,10 +4362,24 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, /* Find smgr relation for buffer */ if (reln == NULL) + { + char relpersistence; + + if (io_object == IOOBJECT_TEMP_RELATION) + relpersistence = RELPERSISTENCE_TEMP; + else + { + Assert(io_object == IOOBJECT_RELATION); + if (pg_atomic_read_u32(&buf->state) & BM_PERMANENT) + relpersistence = RELPERSISTENCE_PERMANENT; + else + relpersistence = RELPERSISTENCE_UNLOGGED; + } + reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER, - io_object == IOOBJECT_RELATION ? - RELPERSISTENCE_PERMANENT : - RELPERSISTENCE_UNLOGGED); + relpersistence); + Assert(reln->smgr_relpersistence == relpersistence); + } TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag), buf->tag.blockNum, @@ -7233,13 +7256,16 @@ buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint64 *io_data; uint8 handle_data_len; - if (is_temp) + if (!am_wal_redo_postgres) { - Assert(td->smgr.relpersistence == RELPERSISTENCE_TEMP); - Assert(pgaio_io_get_owner(ioh) == MyProcNumber); + if (is_temp) + { + Assert(td->smgr.relpersistence == RELPERSISTENCE_TEMP); + Assert(pgaio_io_get_owner(ioh) == MyProcNumber); + } + else + Assert(td->smgr.relpersistence != RELPERSISTENCE_TEMP); } - else - Assert(td->smgr.relpersistence != RELPERSISTENCE_TEMP); /* * Iterate over all the buffers affected by this IO and call the diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 2fa045e6b0f..c2d65308da2 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -37,7 +37,10 @@ #include "replication/slotsync.h" #include "replication/walreceiver.h" #include "replication/walsender.h" +#if PG_MAJORVERSION_NUM >= 18 #include "storage/aio_subsys.h" +#include "storage/aio.h" +#endif #include "storage/bufmgr.h" #include "storage/dsm.h" #include 
"storage/dsm_registry.h" diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 10aaaf82423..65aa21bbe18 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -97,15 +97,17 @@ static const f_smgr smgr_md = { .smgr_registersync = mdregistersync, .smgr_fd = mdfd, .smgr_owns = mdowns, - .smgr_start_unlogged_build = NULL, - .smgr_finish_unlogged_build_phase_1 = NULL, - .smgr_end_unlogged_build = NULL, }; static const f_smgr *smgrsw = &smgr_md; static int NSmgr = 1; +start_unlogged_build_hook_type start_unlogged_build_hook; +finish_unlogged_build_phase_1_hook_type finish_unlogged_build_phase_1_hook; +end_unlogged_build_hook_type end_unlogged_build_hook; +read_slru_segment_hook_type read_slru_segment_hook; + /* * Each backend has a hashtable that stores all extant SMgrRelation objects. * In addition, "unpinned" SMgrRelation objects are chained together in a list. @@ -132,12 +134,15 @@ SmgrId smgrregister(const f_smgr *smgr) { f_smgr *new_smgrsw; + Assert(smgr != NULL); + Assert(smgr->smgr_name != NULL); for (int i = 0; i < NSmgr; i++) { /* different name */ if (strcmp(smgr->smgr_name, smgrsw[i].smgr_name) != 0) continue; + /* exactly the same struct, return the previously-returned ID */ if (memcmp(smgr, &smgrsw[i], sizeof(f_smgr))) return i; @@ -230,6 +235,7 @@ smgropen(RelFileLocator rlocator, ProcNumber backend, char relpersistence) bool found; Assert(RelFileNumberIsValid(rlocator.relNumber)); + Assert(backend == INVALID_PROC_NUMBER || relpersistence == RELPERSISTENCE_TEMP); HOLD_INTERRUPTS(); @@ -285,21 +291,68 @@ smgropen(RelFileLocator rlocator, ProcNumber backend, char relpersistence) } else { + /* if no relpersistence was given, default to that of the existing smgrrel */ + if (relpersistence == 0) + relpersistence = reln->smgr_relpersistence; + /* - * Fix the persistence of the SMgrRelation if we didn't know it already. + * Update the persistence of the smgr. 
* - * If we knew the persistence already, make sure that it hasn't changed. + * If we need to change the owning smgr, we do that now, too. */ - if (reln->smgr_relpersistence == 0) + if (reln->smgr_relpersistence == 0 + || reln->smgr_relpersistence != relpersistence) { + SmgrId id = NSmgr; + + for (; id > 0; id--) + { + if (smgrsw[id - 1].smgr_owns(reln->smgr_rlocator.locator, + reln->smgr_rlocator.backend, + relpersistence)) + break; + } + + /* must have at least one SMGR which owns this reln */ + Assert(id > 0); + id--; + + if (id != reln->smgr_which) + { + for (int i = 0; i < MAX_FORKNUM; i++) + smgrsw[reln->smgr_which].smgr_close(reln, i); + + /* + * ereport(LOG, (errmsg_internal( + * "Changed relpersistence to '%c' from '%c', smgr to '%s' from '%s' (reln %u/%u/%u, %d pins)", + * relpersistence, reln->smgr_relpersistence, + * smgrsw[id].smgr_name, + * smgrsw[reln->smgr_which].smgr_name, + * reln->smgr_rlocator.locator.spcOid, + * reln->smgr_rlocator.locator.dbOid, + * reln->smgr_rlocator.locator.relNumber, + * reln->pincount + * ), errbacktrace())); + */ + + reln->smgr_which = id; + smgrsw[id].smgr_open(reln); + } + else + { + /* + * ereport(LOG, (errmsg_internal( + * "Changed relpersistence to '%c' from '%c' (reln %u/%u/%u, %d pins)", + * relpersistence, reln->smgr_relpersistence, + * reln->smgr_rlocator.locator.spcOid, + * reln->smgr_rlocator.locator.dbOid, + * reln->smgr_rlocator.locator.relNumber, + * reln->pincount + * ), errbacktrace())); + */ + } reln->smgr_relpersistence = relpersistence; } - else if (reln->smgr_relpersistence != 0 && relpersistence != 0 && - reln->smgr_relpersistence != relpersistence) - { - elog(ERROR, "relpersistence mismatch: smgropen %c vs SmgrRelation %c", - relpersistence, reln->smgr_relpersistence); - } } RESUME_INTERRUPTS(); @@ -1017,22 +1070,22 @@ smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off) void smgr_start_unlogged_build(SMgrRelation reln) { - if 
(smgrsw[reln->smgr_which].smgr_start_unlogged_build) - smgrsw[reln->smgr_which].smgr_start_unlogged_build(reln); + if (start_unlogged_build_hook) + start_unlogged_build_hook(reln); } void smgr_finish_unlogged_build_phase_1(SMgrRelation reln) { - if (smgrsw[reln->smgr_which].smgr_finish_unlogged_build_phase_1) - smgrsw[reln->smgr_which].smgr_finish_unlogged_build_phase_1(reln); + if (finish_unlogged_build_phase_1_hook) + finish_unlogged_build_phase_1_hook(reln); } void smgr_end_unlogged_build(SMgrRelation reln) { - if (smgrsw[reln->smgr_which].smgr_end_unlogged_build) - smgrsw[reln->smgr_which].smgr_end_unlogged_build(reln); + if (end_unlogged_build_hook) + end_unlogged_build_hook(reln); } /* @@ -1048,17 +1101,10 @@ smgr_end_unlogged_build(SMgrRelation reln) * oh well. */ int -smgr_read_slru_segment(const char* path, int segno, void* buffer) +read_slru_segment(const char* path, int segno, void* buffer) { - for (int i = NSmgr; i > 0; i--) - { - SmgrId smgr = (i - 1); - if (smgrsw[smgr].smgr_read_slru_segment) - { - return smgrsw[smgr].smgr_read_slru_segment(path, segno, buffer); - } - } - + if (read_slru_segment_hook) + return read_slru_segment_hook(path, segno, buffer); return 0; } diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c index b31f20d41bc..175b3d3d98c 100644 --- a/src/backend/utils/activity/pgstat_database.c +++ b/src/backend/utils/activity/pgstat_database.c @@ -145,6 +145,9 @@ void pgstat_prepare_report_checksum_failure(Oid dboid) { Assert(!CritSectionCount); + /* any checksum failures will have to happen out-of-line */ + if (am_wal_redo_postgres) + return; /* * Just need to ensure this backend has an entry ref for the database. 
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index ba99225b0a3..60132ecca37 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -36,6 +36,10 @@ extern Size EstimatePendingSyncsSpace(void); extern void SerializePendingSyncs(Size maxSize, char *startAddress); extern void RestorePendingSyncs(char *startAddress); +extern void RelationAdjustPendingDelete(RelFileLocator rlocator, + ProcNumber procNumber, + char relpersistence); + /* * These functions used to be in storage/smgr/smgr.c, which explains the * naming diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 53db7708a6a..85092bc91d0 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -548,6 +548,6 @@ extern void RestoreClientConnectionInfo(char *conninfo); extern size_t get_hash_memory_limit(void); /* in storage/buffer/buf_init.c */ -extern bool am_wal_redo_postgres; +extern PGDLLIMPORT bool am_wal_redo_postgres; #endif /* MISCADMIN_H */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 8c66597dc17..a4c5e0b03ef 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -143,14 +143,21 @@ typedef struct f_smgr /* NEON: test if this smgr is responsible for this relation */ bool (*smgr_owns) (RelFileLocator rlocator, ProcNumber backend, char relpersistence); - - /* NEON: Hooks to smgr for unlogged build phases */ - void (*smgr_start_unlogged_build) (SMgrRelation reln); - void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); - void (*smgr_end_unlogged_build) (SMgrRelation reln); - int (*smgr_read_slru_segment) (const char *path, int segno, void* buffer); } f_smgr; +/* NEON: Hooks for unlogged build phases */ +typedef void (*start_unlogged_build_hook_type) (SMgrRelation reln); +typedef void (*finish_unlogged_build_phase_1_hook_type) (SMgrRelation reln); +typedef void (*end_unlogged_build_hook_type) (SMgrRelation reln); + +extern start_unlogged_build_hook_type 
start_unlogged_build_hook; +extern finish_unlogged_build_phase_1_hook_type finish_unlogged_build_phase_1_hook; +extern end_unlogged_build_hook_type end_unlogged_build_hook; + +/* NEON: Hook for reading an SLRU segment from e.g. remote storage */ +typedef int (*read_slru_segment_hook_type) (const char *path, int segno, void* buffer); +extern read_slru_segment_hook_type read_slru_segment_hook; + /* NEON: Alternative implementation of calculate_database_size(), to make it O(1) */ typedef int64 (*dbsize_hook_type) (Oid dbOid); extern PGDLLIMPORT dbsize_hook_type dbsize_hook; @@ -210,7 +217,7 @@ extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); extern void smgr_end_unlogged_build(SMgrRelation reln); /* Neon: Allow on-demand download of SLRU segment data */ -extern int smgr_read_slru_segment(const char *path, int segno, void* buffer); +extern int read_slru_segment(const char *path, int segno, void* buffer); static inline void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, diff --git a/src/test/regress/expected/tablespace_1.out b/src/test/regress/expected/tablespace_1.out index 5b9cefa6057..2c25fd7309f 100644 --- a/src/test/regress/expected/tablespace_1.out +++ b/src/test/regress/expected/tablespace_1.out @@ -51,13 +51,13 @@ ERROR: cannot move system relation "pg_authid_rolname_index" REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_authid; ERROR: cannot reindex system catalogs concurrently -- toast relations, fail -REINDEX (TABLESPACE regress_tblspace) INDEX pg_toast.pg_toast_1260_index; -ERROR: cannot move system relation "pg_toast_1260_index" -REINDEX (TABLESPACE regress_tblspace) INDEX CONCURRENTLY pg_toast.pg_toast_1260_index; +REINDEX (TABLESPACE regress_tblspace) INDEX pg_toast.pg_toast_1262_index; +ERROR: cannot move system relation "pg_toast_1262_index" +REINDEX (TABLESPACE regress_tblspace) INDEX CONCURRENTLY pg_toast.pg_toast_1262_index; ERROR: cannot reindex system catalogs concurrently -REINDEX (TABLESPACE 
regress_tblspace) TABLE pg_toast.pg_toast_1260; -ERROR: cannot move system relation "pg_toast_1260_index" -REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_toast.pg_toast_1260; +REINDEX (TABLESPACE regress_tblspace) TABLE pg_toast.pg_toast_1262; +ERROR: cannot move system relation "pg_toast_1262_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_toast.pg_toast_1262; ERROR: cannot reindex system catalogs concurrently -- system catalog, fail REINDEX (TABLESPACE pg_global) TABLE pg_authid; @@ -926,6 +926,12 @@ DETAIL: tablespace for index testschema.part_a_idx ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; -- Fail, not empty DROP TABLESPACE regress_tblspace; +-- Adequate cache initialization before GRANT +\c - +BEGIN; +GRANT ALL ON TABLESPACE regress_tblspace TO PUBLIC; +ERROR: tablespace "regress_tblspace" does not exist +ROLLBACK; CREATE ROLE regress_tablespace_user1 login; CREATE ROLE regress_tablespace_user2 login; GRANT USAGE ON SCHEMA testschema TO regress_tablespace_user2; From 8a73efbed47721d30675ee5e05e5307bff500314 Mon Sep 17 00:00:00 2001 From: Victor Polevoy Date: Wed, 16 Jul 2025 10:49:42 +0200 Subject: [PATCH 44/50] NEON: Only allocate and use the buffer for SLRU segment with the old communicator. 
--- src/backend/access/transam/slru.c | 38 +++++++++++++++++++------------ src/backend/utils/init/globals.c | 1 + src/include/miscadmin.h | 6 +++++ 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 8dae1c48df9..cabb6ed763e 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -744,7 +744,12 @@ SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) segno = pageno / SLRU_PAGES_PER_SEGMENT; - buffer = palloc(BLCKSZ * SLRU_PAGES_PER_SEGMENT); + if (neon_use_communicator_worker) { + buffer = NULL; + } else { + buffer = palloc(BLCKSZ * SLRU_PAGES_PER_SEGMENT); + } + n_blocks = read_slru_segment(path, segno, buffer); if (n_blocks > 0) { @@ -756,22 +761,25 @@ SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) pfree(buffer); return -1; } - errno = 0; - pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); - if (pg_pwrite(fd, buffer, n_blocks*BLCKSZ, 0) != n_blocks*BLCKSZ) - { - pgstat_report_wait_end(); - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - slru_errcause = SLRU_WRITE_FAILED; - slru_errno = errno; - CloseTransientFile(fd); - pfree(buffer); - return -1; + if (!neon_use_communicator_worker) { + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); + if (pg_pwrite(fd, buffer, n_blocks*BLCKSZ, 0) != n_blocks*BLCKSZ) + { + pgstat_report_wait_end(); + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + slru_errcause = SLRU_WRITE_FAILED; + slru_errno = errno; + + CloseTransientFile(fd); + pfree(buffer); + return -1; + } + pgstat_report_wait_end(); } - pgstat_report_wait_end(); } pfree(buffer); return fd; diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index d31cb45a058..345d563c869 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -119,6 
+119,7 @@ pid_t PostmasterPid = 0; bool IsPostmasterEnvironment = false; bool IsUnderPostmaster = false; bool IsBinaryUpgrade = false; +bool neon_use_communicator_worker = false; bool ExitOnAnyError = false; diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 85092bc91d0..4f8a5548da3 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -170,6 +170,12 @@ extern PGDLLIMPORT pid_t PostmasterPid; extern PGDLLIMPORT bool IsPostmasterEnvironment; extern PGDLLIMPORT bool IsUnderPostmaster; extern PGDLLIMPORT bool IsBinaryUpgrade; +/* Whether the communicator worker is used or not. Defined here so that + * it is also accessible from the main postgres code easily without + * having to look up the value using strings and chain of other + * functions. + */ +extern PGDLLIMPORT bool neon_use_communicator_worker; extern PGDLLIMPORT bool ExitOnAnyError; From db361c1b2f7422b3c725dc70613fa37b0eaf231c Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Fri, 18 Jul 2025 16:54:43 +0200 Subject: [PATCH 45/50] pg hooks for online table (#693) * pg hooks for online table (#24) Adding hooks for PG online table. - PreOnlineTableOp_hook - PostOnlineTableOp_hook - OnlineTableSecurityLabel_hook The 3 hooks are all used to bypass the ownership check on an online table. The Pre/Post hooks are used to allow non-online table owner to perform a few DDLs on the table, e.g., CREATE INDEX. - Pre hook will check the DDL type and whether the referenced relation is an online table. If the conditions are met, it will set the session user as the table owner. The rest of the PG code will work since the session is acting as the table owner now. - Post hook will restore the current user to the session user. OnlineTableSecurityLabel_hook is used to allow customers to add who can perform DDL on an online table. By default, the databricks_superuser can. OnlineTableSecurityLabel_hook will check if the session user is one of the roles attached to the online table. 
If it is, it will allow updating the label to add / remove roles. Otherwise, it will throw the permission error. Why we add them? 1. These 3 hooks are very specific for online table. We don't intend to use them for other purposes. 2. This is also the reason we are not overloading the public hook also. I need to be very specific on the code point where we want to set as table owner. standard_ProcessUtility contains too many stuff. I don't want to make our change generalize to all PG utilities. The extension change is inside https://github.com/databricks-eng/hadron/pull/492 * First pass of review. --------- Co-authored-by: Haoyu Huang --- src/backend/commands/seclabel.c | 19 ++++++++++++++++--- src/backend/tcop/utility.c | 19 +++++++++++++++++++ src/include/commands/seclabel.h | 16 ++++++++++++++++ src/include/tcop/utility.h | 23 +++++++++++++++++++++++ 4 files changed, 74 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/seclabel.c b/src/backend/commands/seclabel.c index cee5d7bbb9c..82362dac396 100644 --- a/src/backend/commands/seclabel.c +++ b/src/backend/commands/seclabel.c @@ -33,6 +33,10 @@ typedef struct static List *label_provider_list = NIL; +/* BEGIN_PG_NEON */ +OnlineTableSecurityLabel_hook_type OnlineTableSecurityLabel_hook = NULL; +/* END_PG_NEON */ + static bool SecLabelSupportsObjectType(ObjectType objtype) { @@ -168,9 +172,18 @@ ExecSecLabelStmt(SecLabelStmt *stmt) address = get_object_address(stmt->objtype, stmt->object, &relation, ShareUpdateExclusiveLock, false); - /* Require ownership of the target object. */ - check_object_ownership(GetUserId(), stmt->objtype, address, - stmt->object, relation); + /* BEGIN_PG_NEON */ + if (OnlineTableSecurityLabel_hook != NULL) + { + OnlineTableSecurityLabel_hook(stmt, &address, relation); + } + else + { + /* Require ownership of the target object. 
*/ + check_object_ownership(GetUserId(), stmt->objtype, address, + stmt->object, relation); + } + /* END_PG_NEON */ /* Perform other integrity checks as needed. */ switch (stmt->objtype) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 25fe3d58016..6f25ca35276 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -69,6 +69,11 @@ /* Hook for plugins to get control in ProcessUtility() */ ProcessUtility_hook_type ProcessUtility_hook = NULL; +/* BEGIN_PG_NEON */ +PreOnlineTableOp_hook_type PreOnlineTableOp_hook = NULL; +PostOnlineTableOp_hook_type PostOnlineTableOp_hook = NULL; +/* END_PG_NEON */ + /* local function declarations */ static int ClassifyUtilityCommandAsReadOnly(Node *parsetree); static void ProcessUtilitySlow(ParseState *pstate, @@ -1115,6 +1120,13 @@ ProcessUtilitySlow(ParseState *pstate, if (isCompleteQuery) EventTriggerDDLCommandStart(parsetree); + /* BEGIN_PG_NEON */ + if (PreOnlineTableOp_hook != NULL) + { + PreOnlineTableOp_hook(parsetree); + } + /* END_PG_NEON */ + switch (nodeTag(parsetree)) { /* @@ -1935,6 +1947,13 @@ ProcessUtilitySlow(ParseState *pstate, } PG_FINALLY(); { + /* BEGIN_PG_NEON */ + if (PostOnlineTableOp_hook != NULL) + { + PostOnlineTableOp_hook(parsetree); + } + /* END_PG_NEON */ + if (needCleanup) EventTriggerEndCompleteQuery(); } diff --git a/src/include/commands/seclabel.h b/src/include/commands/seclabel.h index 4550be94bc9..b9bb1af9f1c 100644 --- a/src/include/commands/seclabel.h +++ b/src/include/commands/seclabel.h @@ -31,4 +31,20 @@ typedef void (*check_object_relabel_type) (const ObjectAddress *object, extern void register_label_provider(const char *provider_name, check_object_relabel_type hook); +/* BEGIN_PG_NEON */ + +/* + * OnlineTableSecurityLabel_hook is used to allow customers to perform DDL on + * an online table. By default, only databricks_superuser can. 
+ * OnlineTableSecurityLabel_hook checks if the session user is one of the + * roles attached to the online table. If it is, it allows updating the label + * to add/remove roles. Otherwise, a permission error is thrown. + */ +typedef void (*OnlineTableSecurityLabel_hook_type) (SecLabelStmt *stmt, + const ObjectAddress *object, + Relation relation); + +extern PGDLLIMPORT OnlineTableSecurityLabel_hook_type OnlineTableSecurityLabel_hook; +/* END_PG_NEON */ + #endif /* SECLABEL_H */ diff --git a/src/include/tcop/utility.h b/src/include/tcop/utility.h index 2f05dd50e25..15ccb3b0f23 100644 --- a/src/include/tcop/utility.h +++ b/src/include/tcop/utility.h @@ -77,6 +77,29 @@ typedef void (*ProcessUtility_hook_type) (PlannedStmt *pstmt, DestReceiver *dest, QueryCompletion *qc); extern PGDLLIMPORT ProcessUtility_hook_type ProcessUtility_hook; +/* BEGIN_PG_NEON */ + +/* + * The Pre/Post hooks are used to allow roles that do not own an online table + * to still perform a few DDLs on the table, e.g., CREATE INDEX. + * + * - Pre hook will check the DDL type and whether the referenced relation is + * an online table. If the conditions are met, it will set the session user + * as the table owner. + * + * Postgres code then (typically) executes as the session user, which is now + * SET to the table owner. + * + * - Post hook will restore the current user to the session user. 
+ */ +typedef void (*PreOnlineTableOp_hook_type) (Node *parsetree); +extern PGDLLIMPORT PreOnlineTableOp_hook_type PreOnlineTableOp_hook; + +typedef void (*PostOnlineTableOp_hook_type) (Node *parsetree); +extern PGDLLIMPORT PostOnlineTableOp_hook_type PostOnlineTableOp_hook; + +/* END_PG_NEON */ + extern void ProcessUtility(PlannedStmt *pstmt, const char *queryString, bool readOnlyTree, ProcessUtilityContext context, ParamListInfo params, From 1fa98173e8cc911718bbda9ce4029b67166d2c86 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 30 Jun 2025 22:24:21 +0300 Subject: [PATCH 46/50] Reintialize page in allocNewBuffer only when buffer is returned --- src/backend/access/spgist/spgutils.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 95fea74e296..ff6b88aa419 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -525,10 +525,10 @@ allocNewBuffer(Relation index, int flags) Buffer buffer; buffer = SpGistNewBuffer(index); - SpGistInitBuffer(buffer, pageflags); if (pageflags & SPGIST_LEAF) { + SpGistInitBuffer(buffer, pageflags); /* Leaf pages have no parity concerns, so just use it */ return buffer; } @@ -539,6 +539,7 @@ allocNewBuffer(Relation index, int flags) if ((flags & GBUF_PARITY_MASK) == blkFlags) { + SpGistInitBuffer(buffer, pageflags); /* Page has right parity, use it */ return buffer; } From dbfab16d8abc96226a05f8217dc2a491dfe22ed2 Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Fri, 25 Jul 2025 11:57:45 +0200 Subject: [PATCH 47/50] Add a CI build to check postgres repository changes. The CI builds Postgres from the source code in this branch installed in the neon vendor/postgres-vXX directory, and then runs the Postgres regression tests and a very simple Neon test that just does CREATE EXTENSION neon. 
This allows making sure we can still build Neon with the current code in the branch, and load the neon shared library at the server's start-up. Actual neon testing is still maintained in the neon repository, the goal of this CI action is to have a minimum amount of feedback when contributing to the Postgres repository. --- .github/workflows/build.yml | 59 +++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 .github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000000..ed4868162f6 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,59 @@ +name: Build Postgres + +on: + push: + branches: + - REL_18_STABLE_neon + pull_request: + branches: + - REL_18_STABLE_neon + +jobs: + build_postgres: + name: Build Postgres + runs-on: ubuntu-latest + steps: + - name: Install Postgres Build Dependencies + run: | + sudo apt-get install build-essential coreutils libreadline-dev zlib1g-dev flex bison libxml2-dev libxslt-dev libssl-dev libxml2-utils xsltproc git + + - name: Install Neon Build Dependencies + run: | + sudo apt install libtool libseccomp-dev clang pkg-config cmake postgresql-client protobuf-compiler libprotobuf-dev libcurl4-openssl-dev openssl libicu-dev + + - name: Checkout Neon main branch + run: | + git clone https://github.com/neondatabase/neon ./neon + + - name: Checkout postgres repository + uses: actions/checkout@v4 + with: + path: './neon/vendor/postgres-v18' + + - name: Build PostgreSQL and Neon Extension + run: | + make -s -j`nproc` -C ./neon -s neon-pg-ext-v18 + + - name: Run PostgreSQL Test Suite + run: | + make -s -j`nproc` -C ./neon/build/v18 check + + - name: Append Postgres binaries to the PATH + run: | + echo "./neon/pg_install/v18/bin" >> "$GITHUB_PATH" + + - name: Start Postgres + run: | + pg_ctl init --pgdata ./data + pg_ctl start --pgdata ./data -o '-c shared_preload_libraries=neon' + + - name: Create Extension Neon + env: + PGHOST: 
/tmp + PGDATABASE: postgres + run: | + psql --echo-queries --command 'create extension neon;' + + - name: Stop Postgres + run: | + pg_ctl stop --pgdata ./data From 73c4b6f0370c6d01a0d2e266d75825cb561a3d86 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 5 Aug 2025 10:58:42 +0200 Subject: [PATCH 48/50] Fix some compiler warnings To be squashed into the relevant commit --- src/backend/access/transam/xlogreader.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index ba148a795b1..834be42d02c 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1593,10 +1593,10 @@ WALRead(XLogReaderState *state, nbytes = count; #ifndef FRONTEND - if ((startptr & wal_segment_size - 1) < XLOG_BLCKSZ && (recptr & wal_segment_size - 1) > SizeOfXLogLongPHD) + if ((startptr & (wal_segment_size - 1)) < XLOG_BLCKSZ && (recptr & (wal_segment_size - 1)) > SizeOfXLogLongPHD) { XLogLongPageHeaderData lphdr; - int baseoff = startptr & wal_segment_size - 1; + int baseoff = startptr & (wal_segment_size - 1); memcpy(&lphdr, &p[-baseoff], SizeOfXLogLongPHD); Assert(lphdr.std.xlp_info & XLP_LONG_HEADER); @@ -1678,10 +1678,10 @@ WALRead(XLogReaderState *state, } #ifndef FRONTEND - if ((recptr & wal_segment_size - 1) <= XLOG_BLCKSZ && (recptr & wal_segment_size - 1) > SizeOfXLogLongPHD) + if ((recptr & (wal_segment_size - 1)) <= XLOG_BLCKSZ && (recptr & (wal_segment_size - 1)) > SizeOfXLogLongPHD) { XLogLongPageHeaderData lphdr; - int baseoff = recptr & wal_segment_size - 1; + int baseoff = recptr & (wal_segment_size - 1); memcpy(&lphdr, &p[-baseoff], SizeOfXLogLongPHD); Assert(lphdr.std.xlp_info & XLP_LONG_HEADER); From 819ea4d941664703d73163d7e0e7398f3abfd8a3 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 5 Aug 2025 18:55:41 +0200 Subject: [PATCH 49/50] Fix issue in FinishWalRecovery with NeonRecoveryRequested Also 
removes the debug logging and assertions that were added to find the issue The issue was that we failed to correctly initialize the page if the page was the first page of a segment, and should thus have a long header. --- src/backend/access/transam/xlogreader.c | 35 +---------------------- src/backend/access/transam/xlogrecovery.c | 11 +++++++ src/backend/access/transam/xlogutils.c | 3 -- 3 files changed, 12 insertions(+), 37 deletions(-) diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 834be42d02c..cbe45044f72 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -80,11 +80,6 @@ report_invalid_record(XLogReaderState *state, const char *fmt,...) va_end(args); state->errormsg_deferred = true; - -#ifndef FRONTEND - ereport(PANIC, (errmsg_internal("%s", state->errormsg_buf), - errbacktrace())); -#endif } /* @@ -1369,11 +1364,10 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, /* hmm, first page of file doesn't have a long header? 
*/ report_invalid_record(state, - "invalid info bits %04X in WAL segment %s, LSN %X/%X, page addr %X/%X, offset %u", + "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u", hdr->xlp_info, fname, LSN_FORMAT_ARGS(recptr), - LSN_FORMAT_ARGS(hdr->xlp_pageaddr), offset); return false; } @@ -1584,26 +1578,10 @@ WALRead(XLogReaderState *state, instr_time io_start; #endif -#ifndef FRONTEND - elog(LOG, "WALRead xlog startptr=%X/%08X count=%d", - LSN_FORMAT_ARGS(startptr), (int) count); -#endif p = buf; recptr = startptr; nbytes = count; -#ifndef FRONTEND - if ((startptr & (wal_segment_size - 1)) < XLOG_BLCKSZ && (recptr & (wal_segment_size - 1)) > SizeOfXLogLongPHD) - { - XLogLongPageHeaderData lphdr; - int baseoff = startptr & (wal_segment_size - 1); - - memcpy(&lphdr, &p[-baseoff], SizeOfXLogLongPHD); - Assert(lphdr.std.xlp_info & XLP_LONG_HEADER); - Assert(lphdr.std.xlp_info & XLP_FIRST_IS_CONTRECORD || lphdr.std.xlp_rem_len == 0); - } -#endif - while (nbytes > 0) { uint32 startoff; @@ -1677,17 +1655,6 @@ WALRead(XLogReaderState *state, p += readbytes; } -#ifndef FRONTEND - if ((recptr & (wal_segment_size - 1)) <= XLOG_BLCKSZ && (recptr & (wal_segment_size - 1)) > SizeOfXLogLongPHD) - { - XLogLongPageHeaderData lphdr; - int baseoff = recptr & (wal_segment_size - 1); - - memcpy(&lphdr, &p[-baseoff], SizeOfXLogLongPHD); - Assert(lphdr.std.xlp_info & XLP_LONG_HEADER); - Assert(lphdr.std.xlp_info & XLP_FIRST_IS_CONTRECORD || lphdr.std.xlp_rem_len == 0); - } -#endif return true; } diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 376317597e3..3503c5f85a1 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1717,6 +1717,17 @@ FinishWalRecovery(void) */ xlogPageHdr->xlp_rem_len = offs - lastPageSize; xlogPageHdr->xlp_info = (xlogPageHdr->xlp_rem_len > 0) ? 
XLP_FIRST_IS_CONTRECORD : 0; + + /* Populate the long header correctly, if */ + if ((pageBeginPtr & (wal_segment_size - 1)) == 0) + { + XLogLongPageHeader longHdr = (XLogLongPageHeader) xlogPageHdr; + xlogPageHdr->xlp_info |= XLP_LONG_HEADER; + longHdr->xlp_seg_size = wal_segment_size; + longHdr->xlp_sysid = GetSystemIdentifier(); + longHdr->xlp_xlog_blcksz = XLOG_BLCKSZ; + } + readOff = XLogSegmentOffset(pageBeginPtr, wal_segment_size); result->lastPageBeginPtr = pageBeginPtr; diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 21d4fa4afcd..552cb92f3e6 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -862,9 +862,6 @@ int read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *cur_page) { - elog(LOG, "Reading local xlog page=%X/%08X len=%d recptr=%X/%08X", - LSN_FORMAT_ARGS(targetPagePtr), reqLen, - LSN_FORMAT_ARGS(targetRecPtr)); return read_local_xlog_page_guts(state, targetPagePtr, reqLen, targetRecPtr, cur_page, true); } From d6cecdfdf2d0fe5fffa528467f63211486bb3e79 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki Date: Thu, 7 Aug 2025 15:36:21 -0400 Subject: [PATCH 50/50] update hook type to be in-line with hadron #1580 --- src/include/storage/smgr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a4c5e0b03ef..66217c6a557 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -155,7 +155,7 @@ extern finish_unlogged_build_phase_1_hook_type finish_unlogged_build_phase_1_hoo extern end_unlogged_build_hook_type end_unlogged_build_hook; /* NEON: Hook for reading an SLRU segment from e.g. 
remote storage */ -typedef int (*read_slru_segment_hook_type) (const char *path, int segno, void* buffer); +typedef bool (*read_slru_segment_hook_type) (const char *path, int segno, void* buffer); extern read_slru_segment_hook_type read_slru_segment_hook; /* NEON: Alternative implementation of calculate_database_size(), to make it O(1) */