Обсуждение: Internal error XX000 with enable_partition_pruning=on, pg 11 beta1on Debian
Hi,
I obtained an XX000 error testing my DSS application with PostgreSQL 11 beta 1.
Here is a simplified version of my test, no data in the tables :
select version();
version
-----------------------------------------------------------------------------------------------------------------------
PostgreSQL 11beta1 (Debian 11~beta1-2.pgdg+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 7.3.0-19) 7.3.0, 64-bit
(1 ligne)
-- connected superuser -- postgres
create user a password 'a';
create schema a authorization a;
create user b password 'b';
create schema b authorization b;
create user c password 'c';
create schema c authorization c;
create user z password 'z';
create schema z authorization z;
-- connected a
create table t1(k1 timestamp, c1 int);
create view v as select k1, c1 from t1;
grant usage on schema a to z;
grant select on all tables in schema a to z;
-- connected b
create table t2(k1 timestamp, c1 int) partition by range(k1);
create table t2_2016 partition of t2 for values from ('2016-01-01') to ('2017-01-01');
create table t2_2017 partition of t2 for values from ('2017-01-01') to ('2018-01-01');
create table t2_2018 partition of t2 for values from ('2018-01-01') to ('2019-01-01');
create view v as select k1, c1 from t2;
grant select on all tables in schema b to z;
grant usage on schema b to z;
-- connected c
create table t3(k1 timestamp, c1 int) partition by range(k1);
create table t3_2016 partition of t3 for values from ('2016-01-01') to ('2017-01-01');
create table t3_2017 partition of t3 for values from ('2017-01-01') to ('2018-01-01');
create table t3_2018 partition of t3 for values from ('2018-01-01') to ('2019-01-01');
create view v as select k1, c1 from t3;
grant select on all tables in schema c to z;
grant usage on schema c to z;
-- connected z
create view v as
select k1, c1 from
(select * from a.v
UNION ALL
select * from b.v
UNION ALL
select * from c.v) vabc ;
explain analyze select * from v where v.k1 > date '2017-01-01';
ERREUR: XX000: did not find all requested child rels in append_rel_list
EMPLACEMENT : find_appinfos_by_relids, prepunion.c : 2643
set enable_partition_pruning=off;
SET
explain analyze select * from v where v.k1 > date '2017-01-01';
QUERY PLAN
-----------------------------------------------------------------------------------------------------------
Append (cost=0.00..272.30 rows=4760 width=12) (actual time=0.217..0.217 rows=0 loops=1)
-> Seq Scan on t1 (cost=0.00..35.50 rows=680 width=12) (actual time=0.020..0.020 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
-> Seq Scan on t2_2016 (cost=0.00..35.50 rows=680 width=12) (actual time=0.035..0.035 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
-> Seq Scan on t2_2017 (cost=0.00..35.50 rows=680 width=12) (actual time=0.016..0.016 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
-> Seq Scan on t2_2018 (cost=0.00..35.50 rows=680 width=12) (actual time=0.015..0.015 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
-> Seq Scan on t3_2016 (cost=0.00..35.50 rows=680 width=12) (actual time=0.040..0.040 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
-> Seq Scan on t3_2017 (cost=0.00..35.50 rows=680 width=12) (actual time=0.016..0.016 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
-> Seq Scan on t3_2018 (cost=0.00..35.50 rows=680 width=12) (actual time=0.016..0.016 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
Planning Time: 0.639 ms
Execution Time: 0.400 ms
set enable_partition_pruning=on;
SET
explain analyze select * from v where v.k1 > date '2017-01-01';
ERREUR: XX000: did not find all requested child rels in append_rel_list
EMPLACEMENT : find_appinfos_by_relids, prepunion.c : 2643
-- 10
select version();
version
--------------------------------------------------------------------------------------------------------------------------------------------
PostgreSQL 10.4 (Ubuntu 10.4-2.pgdg16.04+1) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 5.4.0-6ubuntu1~16.04.9) 5.4.0 20160609, 64-bit
(1 ligne)
-- connected superuser -- postgres
create user a password 'a';
create schema a authorization a;
create user b password 'b';
create schema b authorization b;
create user c password 'c';
create schema c authorization c;
create user z password 'z';
create schema z authorization z;
-- connected a
create table t1(k1 timestamp, c1 int);
create view v as select k1, c1 from t1;
grant usage on schema a to z;
grant select on all tables in schema a to z;
-- connected b
create table t2(k1 timestamp, c1 int) partition by range(k1);
create table t2_2016 partition of t2 for values from ('2016-01-01') to ('2017-01-01');
create table t2_2017 partition of t2 for values from ('2017-01-01') to ('2018-01-01');
create table t2_2018 partition of t2 for values from ('2018-01-01') to ('2019-01-01');
create view v as select k1, c1 from t2;
grant select on all tables in schema b to z;
grant usage on schema b to z;
-- connected c
create table t3(k1 timestamp, c1 int) partition by range(k1);
create table t3_2016 partition of t3 for values from ('2016-01-01') to ('2017-01-01');
create table t3_2017 partition of t3 for values from ('2017-01-01') to ('2018-01-01');
create table t3_2018 partition of t3 for values from ('2018-01-01') to ('2019-01-01');
create view v as select k1, c1 from t3;
grant select on all tables in schema c to z;
grant usage on schema c to z;
-- connected z
create view v as
select k1, c1 from
(select * from a.v
UNION ALL
select * from b.v
UNION ALL
select * from c.v) vabc ;
explain analyze select * from v where v.k1 > date '2017-01-01';
QUERY PLAN
-----------------------------------------------------------------------------------------------------------
Append (cost=0.00..177.50 rows=3400 width=12) (actual time=0.206..0.206 rows=0 loops=1)
-> Seq Scan on t1 (cost=0.00..35.50 rows=680 width=12) (actual time=0.044..0.044 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
-> Seq Scan on t2_2017 (cost=0.00..35.50 rows=680 width=12) (actual time=0.020..0.020 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
-> Seq Scan on t2_2018 (cost=0.00..35.50 rows=680 width=12) (actual time=0.020..0.020 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
-> Seq Scan on t3_2017 (cost=0.00..35.50 rows=680 width=12) (actual time=0.036..0.036 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
-> Seq Scan on t3_2018 (cost=0.00..35.50 rows=680 width=12) (actual time=0.020..0.020 rows=0 loops=1)
Filter: (k1 > '2017-01-01'::date)
Planning time: 0.780 ms
Execution time: 0.427 ms
Best regards
Phil
Phil Florent <philflorent@hotmail.com> writes: > explain analyze select * from v where v.k1 > date '2017-01-01'; > ERREUR: XX000: did not find all requested child rels in append_rel_list > EMPLACEMENT : find_appinfos_by_relids, prepunion.c : 2643 Reproduced here, thanks for the report! This is very timely since we were just in process of rewriting that code anyway ... regards, tom lane
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 9 June 2018 at 04:57, Tom Lane <tgl@sss.pgh.pa.us> wrote: > Phil Florent <philflorent@hotmail.com> writes: >> explain analyze select * from v where v.k1 > date '2017-01-01'; >> ERREUR: XX000: did not find all requested child rels in append_rel_list >> EMPLACEMENT : find_appinfos_by_relids, prepunion.c : 2643 > > Reproduced here, thanks for the report! This is very timely since > we were just in process of rewriting that code anyway ... Yeah. Thanks for the report Phil. It looks like this was 499be013de6, which was one of mine. A more simple case to reproduce is: drop table listp; create table listp (a int, b int) partition by list(a); create table listp1 partition of listp for values in (1); select * from (select * from listp union all select * from listp) t where a = 1; I'll look in more detail after sleeping. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 9 June 2018 at 06:50, David Rowley <david.rowley@2ndquadrant.com> wrote: > It looks like this was 499be013de6, which was one of mine. > > A more simple case to reproduce is: > > drop table listp; > create table listp (a int, b int) partition by list(a); > create table listp1 partition of listp for values in (1); > select * from (select * from listp union all select * from listp) t where a = 1; > > I'll look in more detail after sleeping. So it looks like I've assumed that the Append path's partitioned_rels will only ever be set for partitioned tables, but it can, in fact, be set for UNION ALL parents too when the union children are partitioned tables. As a discussion topic, I've attached a patch which does resolve the error, but it also disables run-time pruning in this case. There might be some way we can treat UNION ALL parents differently when building the PartitionPruneInfos. I've just not thought of what this would be just yet. If I can't think of that, I wonder if this is a rare enough case not to bother with run-time partition pruning. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Вложения
David Rowley <david.rowley@2ndquadrant.com> writes: > So it looks like I've assumed that the Append path's partitioned_rels > will only ever be set for partitioned tables, but it can, in fact, be > set for UNION ALL parents too when the union children are partitioned > tables. > As a discussion topic, I've attached a patch which does resolve the > error, but it also disables run-time pruning in this case. > There might be some way we can treat UNION ALL parents differently > when building the PartitionPruneInfos. I've just not thought of what > this would be just yet. If I can't think of that, I wonder if this is > a rare enough case not to bother with run-time partition pruning. So, IIUC, the issue is that for partitioning cases Append expects *all* its children to be partitions of the *same* partitioned table? That is, you could also break it with select * from partitioned_table_a union all select * from partitioned_table_b ? If so, maybe the best solution is to not allow a partitioning appendrel to be flattened into an appendrel generated in other ways (particularly, via UNION ALL). I also wonder whether it was a bad idea to treat these as the same kind of path/plan in the first place. regards, tom lane
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 10 June 2018 at 04:48, Tom Lane <tgl@sss.pgh.pa.us> wrote: > So, IIUC, the issue is that for partitioning cases Append expects *all* > its children to be partitions of the *same* partitioned table? That > is, you could also break it with > > select * from partitioned_table_a > union all > select * from partitioned_table_b > > ? Not quite. I think what I sent above is the most simple way to break it. Your case won't because there are no quals to prune with, so run-time pruning is never attempted. > If so, maybe the best solution is to not allow a partitioning appendrel > to be flattened into an appendrel generated in other ways (particularly, > via UNION ALL). I also wonder whether it was a bad idea to treat these > as the same kind of path/plan in the first place. That might be the best idea. I'll look into that now. The only drawback I see is that we'll end up pulling tuples through more Append nodes in cases like you mentioned above. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
David Rowley <david.rowley@2ndquadrant.com> writes: > On 10 June 2018 at 04:48, Tom Lane <tgl@sss.pgh.pa.us> wrote: >> So, IIUC, the issue is that for partitioning cases Append expects *all* >> its children to be partitions of the *same* partitioned table? That >> is, you could also break it with >> >> select * from partitioned_table_a >> union all >> select * from partitioned_table_b >> >> ? > Not quite. I think what I sent above is the most simple way to break > it. Your case won't because there are no quals to prune with, so > run-time pruning is never attempted. Well, I hadn't bothered to put in the extra code needed to have a qual to prune with, but my point remains that it doesn't seem like the current Append code is prepared to cope with an Append that contains partitions of more than one top-level partitioned table. I just had a thought that might lead to a nice solution to that, or might be totally crazy. What if we inverted the sense of the bitmaps that track partition pruning state, so that instead of a bitmap of valid partitions that need to be scanned, we had a bitmap of pruned partitions that we know we don't need to scan? (The indexes of this bitmap would be subplan indexes not partition indexes.) With this representation, it doesn't matter if some of the Append's children are not supposed to participate in pruning; they just don't ever get added to the bitmap of what to skip. It's also fairly clear, I think, how to handle independent pruning rules for different top-level tables that are being unioned together: just OR the what-to-skip bitmaps. But there may be some reason why this isn't workable. regards, tom lane
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 11 June 2018 at 12:19, Tom Lane <tgl@sss.pgh.pa.us> wrote: > David Rowley <david.rowley@2ndquadrant.com> writes: >> On 10 June 2018 at 04:48, Tom Lane <tgl@sss.pgh.pa.us> wrote: >>> So, IIUC, the issue is that for partitioning cases Append expects *all* >>> its children to be partitions of the *same* partitioned table? That >>> is, you could also break it with >>> >>> select * from partitioned_table_a >>> union all >>> select * from partitioned_table_b >>> >>> ? > >> Not quite. I think what I sent above is the most simple way to break >> it. Your case won't because there are no quals to prune with, so >> run-time pruning is never attempted. > > Well, I hadn't bothered to put in the extra code needed to have a qual > to prune with, but my point remains that it doesn't seem like the current > Append code is prepared to cope with an Append that contains partitions > of more than one top-level partitioned table. Besides the partition pruning code, was there other aspects of Append that you saw to be incompatible with mixed hierarchies? > I just had a thought that might lead to a nice solution to that, or > might be totally crazy. What if we inverted the sense of the bitmaps > that track partition pruning state, so that instead of a bitmap of > valid partitions that need to be scanned, we had a bitmap of pruned > partitions that we know we don't need to scan? (The indexes of this > bitmap would be subplan indexes not partition indexes.) With this > representation, it doesn't matter if some of the Append's children > are not supposed to participate in pruning; they just don't ever get > added to the bitmap of what to skip. It's also fairly clear, I think, > how to handle independent pruning rules for different top-level tables > that are being unioned together: just OR the what-to-skip bitmaps. > But there may be some reason why this isn't workable. I think it would be less efficient. A common case and one that I very much would like to make as fast as possible is when all but a single partition is pruned. Doing the opposite sounds like more effort would need to be expended to get the subplans that we do need to scan. I don't really see the way it works now as a huge problem to overcome in pruning. We'd just a list of subplans that don't belong to the hierarchy and tag them on to the matches found in ExecFindInitialMatchingSubPlans and ExecFindMatchingSubPlans. The bigger issue to overcome is the mixed flattened list of partition RT indexes in partitioned_rels. Perhaps having a list of Lists for partitioned_rels could be used to resolve that. The question is more, how to solve for PG11. Do we need that? I think we'll very soon be wanting to have ordered partition scans where something like: create table listp(a int) partition by list(a); create index on listp(a); create table listp1 partition of listp for values in (1); create table listp2 partition of listp for values in (2); and select * from listp order by a; would be possible with an Append and Index Scan, rather than having a MergeAppend or Sort. In which case we'll not want mixed partition hierarchies in the Append subplans. Although, perhaps that would mean we just wouldn't pullup AppendPaths which have PathKeys. I have written and attached the patch to stop flattening of partitioned tables into UNION ALL parent's paths, meaning we can now get nested Append and MergeAppend paths. I've added Robert too as I know he was the committer of partitioning and parallel Append. Maybe he has a view on what should be done about this? Is not flattening the paths a problem? -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Вложения
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018/06/11 16:49, David Rowley wrote: > On 11 June 2018 at 12:19, Tom Lane <tgl@sss.pgh.pa.us> wrote: >> David Rowley <david.rowley@2ndquadrant.com> writes: >>> On 10 June 2018 at 04:48, Tom Lane <tgl@sss.pgh.pa.us> wrote: >>>> So, IIUC, the issue is that for partitioning cases Append expects *all* >>>> its children to be partitions of the *same* partitioned table? That >>>> is, you could also break it with >>>> >>>> select * from partitioned_table_a >>>> union all >>>> select * from partitioned_table_b >>>> >>>> ? >> >>> Not quite. That would be correct I think. An Append may contain multiple partitioned tables that all appear under an UNION ALL parent, as in the OP's case and the example above. In this case, the partitioned_rels list of Append consist of non-leaf tables from *all* of the partitioned tables. Before run-time pruning arrived, the only purpose of partitioned_rels list was to make sure that the executor goes through it and locks all of those non-leaf tables (ExecLockNonLeafAppendTables). Run-time pruning expanded its usage by depending it to generate run-time pruning info. >> I just had a thought that might lead to a nice solution to that, or >> might be totally crazy. What if we inverted the sense of the bitmaps >> that track partition pruning state, so that instead of a bitmap of >> valid partitions that need to be scanned, we had a bitmap of pruned >> partitions that we know we don't need to scan? (The indexes of this >> bitmap would be subplan indexes not partition indexes.) With this >> representation, it doesn't matter if some of the Append's children >> are not supposed to participate in pruning; they just don't ever get >> added to the bitmap of what to skip. It's also fairly clear, I think, >> how to handle independent pruning rules for different top-level tables >> that are being unioned together: just OR the what-to-skip bitmaps. >> But there may be some reason why this isn't workable. > > I think it would be less efficient. A common case and one that I very > much would like to make as fast as possible is when all but a single > partition is pruned. Doing the opposite sounds like more effort would > need to be expended to get the subplans that we do need to scan. > > I don't really see the way it works now as a huge problem to overcome > in pruning. We'd just a list of subplans that don't belong to the > hierarchy and tag them on to the matches found in > ExecFindInitialMatchingSubPlans and ExecFindMatchingSubPlans. The > bigger issue to overcome is the mixed flattened list of partition RT > indexes in partitioned_rels. Perhaps having a list of Lists for > partitioned_rels could be used to resolve that. The question is more, > how to solve for PG11. Do we need that? > > I think we'll very soon be wanting to have ordered partition scans > where something like: > > create table listp(a int) partition by list(a); > create index on listp(a); > create table listp1 partition of listp for values in (1); > create table listp2 partition of listp for values in (2); > > and > > select * from listp order by a; > > would be possible with an Append and Index Scan, rather than having a > MergeAppend or Sort. In which case we'll not want mixed partition > hierarchies in the Append subplans. Although, perhaps that would mean > we just wouldn't pullup AppendPaths which have PathKeys. > > I have written and attached the patch to stop flattening of > partitioned tables into UNION ALL parent's paths, meaning we can now > get nested Append and MergeAppend paths. > > I've added Robert too as I know he was the committer of partitioning > and parallel Append. Maybe he has a view on what should be done about > this? Is not flattening the paths a problem? Not speaking for Robert here, just saying from what I know. I don't think your patch breaks anything, even if does change the shape of the plan. So, for: select * from partitioned_table_a union all select * from partitioned_table_b The only thing that changes with the patch is that ExecLockNonLeafAppendTables is called *twice* for the two nested Appends corresponding to partitioned_table_a and partitioned_table_b, resp., instead of just once for the top level Append corresponding to the UNION ALL parent. In fact, when called for the top level Append, ExecLockNonLeafAppendTables is now a no-op. Thanks, Amit
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 15 June 2018 at 20:37, Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> wrote: > select * from partitioned_table_a > union all > select * from partitioned_table_b > > The only thing that changes with the patch is that > ExecLockNonLeafAppendTables is called *twice* for the two nested Appends > corresponding to partitioned_table_a and partitioned_table_b, resp., > instead of just once for the top level Append corresponding to the UNION > ALL parent. In fact, when called for the top level Append, > ExecLockNonLeafAppendTables is now a no-op. If the top level Append is the UNION ALL one, then there won't be any partitioned_rels. If that's what you mean by no-op then, yeah. There are no duplicate locks already obtained in the parent with the child Append node. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018/06/15 20:41, David Rowley wrote: > On 15 June 2018 at 20:37, Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> wrote: >> select * from partitioned_table_a >> union all >> select * from partitioned_table_b >> >> The only thing that changes with the patch is that >> ExecLockNonLeafAppendTables is called *twice* for the two nested Appends >> corresponding to partitioned_table_a and partitioned_table_b, resp., >> instead of just once for the top level Append corresponding to the UNION >> ALL parent. In fact, when called for the top level Append, >> ExecLockNonLeafAppendTables is now a no-op. > > If the top level Append is the UNION ALL one, then there won't be any > partitioned_rels. If that's what you mean by no-op then, yeah. There > are no duplicate locks already obtained in the parent with the child > Append node. Yeah, that's what I meant to say. I looked for whether the locks end up being taken twice, once in the UNION ALL parent's ExecInitAppend and then again in the individual child Appends' ExecInitAppend, but that they don't, so the patch is right. Thanks, Amit
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 18 June 2018 at 14:36, Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> wrote: > On 2018/06/15 20:41, David Rowley wrote: >> If the top level Append is the UNION ALL one, then there won't be any >> partitioned_rels. If that's what you mean by no-op then, yeah. There >> are no duplicate locks already obtained in the parent with the child >> Append node. > > Yeah, that's what I meant to say. I looked for whether the locks end up > being taken twice, once in the UNION ALL parent's ExecInitAppend and then > again in the individual child Appends' ExecInitAppend, but that they > don't, so the patch is right. Thanks for looking. Robert, do you have any objections to the proposed patch? -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On Sun, Jun 17, 2018 at 10:59 PM, David Rowley <david.rowley@2ndquadrant.com> wrote: > Thanks for looking. > > Robert, do you have any objections to the proposed patch? I don't have time to study this right now, but I think the main possible objection is around performance. If not flattening the Append is the best way to make queries run fast, then we should do it that way. If making pruning capable of coping with mixed hierarchies is going to be faster, then we should do that. If I were to speculate in the absence of data, my guess would be that failing to flatten the hierarchy is going to lead to a significant per-tuple cost, while the cost of making run-time pruning smarter is likely to be incurred once per rescan (i.e. a lot less). But that might be wrong, and it might be impractical to get this working perfectly in v11 given the time we have. But I would suggest that you performance test a query that ends up feeding lots of tuples through two Append nodes rather than one and see how much it hurts. -- Robert Haas EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 20 June 2018 at 02:28, Robert Haas <robertmhaas@gmail.com> wrote: > On Sun, Jun 17, 2018 at 10:59 PM, David Rowley >> Robert, do you have any objections to the proposed patch? > > I don't have time to study this right now, but I think the main > possible objection is around performance. If not flattening the > Append is the best way to make queries run fast, then we should do it > that way. If making pruning capable of coping with mixed hierarchies > is going to be faster, then we should do that. If I were to speculate > in the absence of data, my guess would be that failing to flatten the > hierarchy is going to lead to a significant per-tuple cost, while the > cost of making run-time pruning smarter is likely to be incurred once > per rescan (i.e. a lot less). But that might be wrong, and it might > be impractical to get this working perfectly in v11 given the time we > have. But I would suggest that you performance test a query that ends > up feeding lots of tuples through two Append nodes rather than one and > see how much it hurts. I've performed two tests. One to see what the overhead of the additional append is, and one to see what the saving from pruning away unneeded partitions is. I tried to make the 2nd test use a realistic number of partitions. Partition pruning will become more useful with higher numbers of partitions. Test 1: Test overhead of pulling tuples through an additional append create table p (a int) partition by list (a); create table p1 partition of p for values in(1); insert into p select 1 from generate_series(1,1000000); vacuum p1; set max_parallel_workers_per_gather=0; select count(*) from (select * from p union all select * from p) p; Unpatched: tps = 8.530355 (excluding connections establishing) Patched: tps = 7.853939 (excluding connections establishing) Patched version takes 108.61% of the unpatched time. Test 2: Tests time saved from run-time partition pruning and not scanning the index on 23 of the partitions. create table rp (d date) partition by range (d); select 'CREATE TABLE rp' || x::text || ' PARTITION OF rp FOR VALUES FROM (''' || '2017-01-01'::date + (x::text || ' month')::interval || ''') TO (''' || '2017-01-01'::date + ((x+1)::text || ' month')::interval || ''');' from generate_Series(0,23) x; \gexec insert into rp select d::date from generate_series('2017-01-01','2018-12-31', interval '10 sec') d; create index on rp (d); select count(*) from (select * from rp union all select * from rp) rp where d = current_date; Unpatched: (set enable_partition_pruning = 0; to make it work) tps = 260.969953 (excluding connections establishing) Patched: tps = 301.319038 (excluding connections establishing) Patched version takes 86.61% of the unpatched time. So, I don't think that really concludes much. I'd say the overhead shown in test 1 is going to be a bit more predictable as it will depend on how many tuples are being pulled through the additional Append, but the savings shown in test 2 will vary. Having run-time pruning not magically fail to work when the partitioned table is part of a UNION ALL certainly seems less surprising. If I drop the index from the "d" column in test 2, the performance gap increases significantly and is roughly proportional to the number of partitions. Unpatched: tps = 0.523691 (excluding connections establishing) Patched: tps = 13.453964 (excluding connections establishing) -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 20 June 2018 at 13:20, David Rowley <david.rowley@2ndquadrant.com> wrote: > select count(*) from (select * from p union all select * from p) p; > > Unpatched: > tps = 8.530355 (excluding connections establishing) > > Patched: > tps = 7.853939 (excluding connections establishing) I've been thinking about this and I'm not so happy about my earlier proposed fix about not flattening the Appends for UNION ALL parents with partitioned relation children. Apart from the additional overhead of pulling all tuples through an additional Append node, I also don't like it because we one day might decide to fix it and make it flatten these again. It would be much nicer not to fool around with the plan shapes like that from one release to the next. So, today I decided to write some code to just make the UNION ALL parents work with partitioned tables while maintaining the Append hierarchy flattening. I've only just completed reading back through all the code and I think it's correct. I ended up changing add_paths_to_append_rel() so that instead of performing concatenation on partitioned_rels from two UNION ALL children, it creates a List of lists. I believe this list can only end up with a 2-level hierarchy of partitioned rels. I tested this and it seems to work as I expect, although I think I need to study the code a bit more to ensure it can't happen. I need to check if there's some cases where nested UNION ALLs cannot be flattened to have a single UNION ALL parent. Supporting this did cause me to have to check the List->type in a few places. I only saw one other place in the code where this is done, so I figured that meant it was okay. In the executor side, I didn't add any pre-calculation code for each partition hierarchy. I did this previously to save having to re-prune on individual partitions, but I figured that was at a good enough level not to have to add it for each partition hierarchy. I imagine most partition hierarchies will just contain a single partitioned table anyway, so an additional level of caching might not buy very much, but I might just be trying to convince myself of that because it's late here now. Anyway... Patch attached. This is method 3 of the 3 methods I thought to fix this, so if this is not suitable then I'm out of ideas. It would be great to get some feedback on this as I'd really like to be done with it before July. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Вложения
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018-Jun-27, David Rowley wrote: > I've only just completed reading back through all the code and I think > it's correct. I ended up changing add_paths_to_append_rel() so that > instead of performing concatenation on partitioned_rels from two UNION > ALL children, it creates a List of lists. I believe this list can > only end up with a 2-level hierarchy of partitioned rels. I tested > this and it seems to work as I expect, although I think I need to > study the code a bit more to ensure it can't happen. I need to check > if there's some cases where nested UNION ALLs cannot be flattened to > have a single UNION ALL parent. Supporting this did cause me to have > to check the List->type in a few places. I only saw one other place in > the code where this is done, so I figured that meant it was okay. Checking a node's ->type member is not idiomatic -- use IsA() for that. (Strangely, we have IsIntegerList() but only for use within list.c.) But do we rely on the ordering of these lists anywhere? I'm wondering why it's not more sensible to use a bitmapset there (I guess for your list-of-lists business you'd have a list of bms's). I didn't look your patch much further. Since Tom has been revamping this code lately, I think it's a good idea to wait for his input. -- Álvaro Herrera https://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Alvaro Herrera <alvherre@2ndquadrant.com> writes: > Since Tom has been revamping this code lately, I think it's a good > idea to wait for his input. I'm on vacation and won't have time to look at this until week after next. If you don't mind putting the topic on hold that long, I'll be happy to take responsibility for it. regards, tom lane
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
Hi Tom, On 2018-06-29 18:17:08 -0400, Tom Lane wrote: > Alvaro Herrera <alvherre@2ndquadrant.com> writes: > > Since Tom has been revamping this code lately, I think it's a good > > idea to wait for his input. > > I'm on vacation and won't have time to look at this until week after > next. If you don't mind putting the topic on hold that long, I'll > be happy to take responsibility for it. Is that still the plan? Do you forsee any issues here? This has been somewhat of a longstanding open item... Greetings, Andres Freund
Andres Freund <andres@anarazel.de> writes: > On 2018-06-29 18:17:08 -0400, Tom Lane wrote: >> I'm on vacation and won't have time to look at this until week after >> next. If you don't mind putting the topic on hold that long, I'll >> be happy to take responsibility for it. > Is that still the plan? Do you forsee any issues here? This has been > somewhat of a longstanding open item... It's on my to-do list, but I'm still catching up vacation backlog. Is this item blocking anyone? regards, tom lane
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018-Jul-12, Tom Lane wrote: > Andres Freund <andres@anarazel.de> writes: > > On 2018-06-29 18:17:08 -0400, Tom Lane wrote: > >> I'm on vacation and won't have time to look at this until week after > >> next. If you don't mind putting the topic on hold that long, I'll > >> be happy to take responsibility for it. > > > Is that still the plan? Do you forsee any issues here? This has been > > somewhat of a longstanding open item... > > It's on my to-do list, but I'm still catching up vacation backlog. > Is this item blocking anyone? I don't think so. The patch might conflict with other fixes, but I suppose that's a fact of life. -- Álvaro Herrera https://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
David Rowley <david.rowley@2ndquadrant.com> writes: > Anyway... Patch attached. This is method 3 of the 3 methods I thought > to fix this, so if this is not suitable then I'm out of ideas. I started to look at this patch. I think this is basically the right direction to go in, but I'm not terribly happy with the details of the data structure design. First off, given that we're now going to have a Plan data structure that accurately reflects the partition hierarchy, I wonder whether we couldn't get rid of the fiddling around with lists of ints and lists of lists of ints; aren't those basically duplicative? They'd certainly be so if we add the rel's RT index to PartitionedRelPruneInfo, but maybe they are even without that. Alvaro complained about the code associated with those lists not being very idiomatic, but I'd like to just get rid of the lists entirely. I especially don't care for keeping the implicit ordering assumptions in those lists (parents before children) when the same info is now going to be explicit in a nearby structure. (This ties into the remarks I just made to Amit about getting rid of executor-startup lock-taking logic. Most of that change only makes sense to do in v12, but I'd prefer that this patch not double down on reliance on data structures we'd like to get rid of.) Second, I still feel that you've got the sense of the prunable-subplans bitmaps backwards, and I do not buy the micro-optimization argument you made for doing it like that. It complicates the code, yet the cost of one bit in a bitmap is completely negligible compared to all the other costs involved in having a per-partition subplan, whether or not that subplan actually gets used in a particular run. One very minor quibble is that I'd be inclined to represent the PartitionPruningData struct like this: typedef struct PartitionPruningData { int num_partrelprunedata; PartitionedRelPruningData partrelprunedata[FLEXIBLE_ARRAY_MEMBER]; } PartitionPruningData; so we can allocate it with one palloc not two. Also, in the Plan representation, I'd suggest avoiding situations where a data structure is sometimes a List of Lists of PartitionedRelPruneInfo and sometimes just a single-level List. Saving one ListCell isn't worth the code complexity and error-proneness of that. regards, tom lane
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 16 July 2018 at 06:55, Tom Lane <tgl@sss.pgh.pa.us> wrote: > First off, given that we're now going to have a Plan data structure > that accurately reflects the partition hierarchy, I wonder whether we > couldn't get rid of the fiddling around with lists of ints and lists of > lists of ints; aren't those basically duplicative? I'm a bit confused by this. I get that you're talking about the partitioned_rels List, but without that list then how would make_partition_pruneinfo() know what the subnode's parents are? Perhaps we could add a relid field in RelOptInfo to mark the direct parent of a but that does not make getting a unique list very quick when given a List of subplans. A Bitmapset could be used, but we'll end up with a mixed hierarchy with UNION ALL parents. Unsure how to separate those again without some complex processing. I'm not objecting to improving this as I don't really like that list, but I just can't quite think of how else to represent the partition hierarchy. > They'd certainly be > so if we add the rel's RT index to PartitionedRelPruneInfo, but maybe > they are even without that. Alvaro complained about the code associated > with those lists not being very idiomatic, but I'd like to just get rid > of the lists entirely. I especially don't care for keeping the implicit > ordering assumptions in those lists (parents before children) when the > same info is now going to be explicit in a nearby structure. (This > ties into the remarks I just made to Amit about getting rid of > executor-startup lock-taking logic. Most of that change only makes > sense to do in v12, but I'd prefer that this patch not double down on > reliance on data structures we'd like to get rid of.) Right, but I need to use something for v11. Do you want to see two separate patches here? If we're not going to change this in v11 then I still need to use something to describe the partition hierarchy so that make_partition_pruneinfo() knows how to build the data structures for run-time pruning. > Second, I still feel that you've got the sense of the prunable-subplans > bitmaps backwards, and I do not buy the micro-optimization argument you > made for doing it like that. It complicates the code, yet the cost of > one bit in a bitmap is completely negligible compared to all the other > costs involved in having a per-partition subplan, whether or not that > subplan actually gets used in a particular run. But get_matching_partitions() returns the set of matching partitions, not the set that does not match. It sounds like doing it the way you ask would require inverting the Bitmapset returned by that. I don't quite understand why you think this is more simple to implement. I can't see how we'd map the non-matching partitions into subplan indexes for the Append node. Can you give a bit more detail on your design for this? > One very minor quibble is that I'd be inclined to represent the > PartitionPruningData struct like this: > > typedef struct PartitionPruningData > { > int num_partrelprunedata; > PartitionedRelPruningData partrelprunedata[FLEXIBLE_ARRAY_MEMBER]; > } PartitionPruningData; > > so we can allocate it with one palloc not two. That could be done, sort of. Only I currently allocate the array which stores these PartitionPruningDatas as one chunk of memory. I can do that today because the PartitionPruningData struct is a fixed size. If you want to make it have a flexible size then I'd need to allocate an array of pointers in the PartitionPruningState... or use a FLEXIBLE_ARRAY_MEMBER of pointers there too. I've done it that way locally for now. > Also, in the Plan > representation, I'd suggest avoiding situations where a data structure > is sometimes a List of Lists of PartitionedRelPruneInfo and sometimes > just a single-level List. Saving one ListCell isn't worth the code > complexity and error-proneness of that. I'll make that change. But I'm confused; was the first thing you mentioned above not to get rid of this list? -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 16 July 2018 at 06:55, Tom Lane <tgl@sss.pgh.pa.us> wrote: > Also, in the Plan > representation, I'd suggest avoiding situations where a data structure > is sometimes a List of Lists of PartitionedRelPruneInfo and sometimes > just a single-level List. Saving one ListCell isn't worth the code > complexity and error-proneness of that. Thinking about this some more, I don't quite see any reason that the partitioned_rels for a single hierarchy couldn't just be a Bitmapset instead of an IntList. A parent partition is always going to have a lower relid than its children, so that means that the top level parent will just have the lowest member in the set. There's already code in the inheritance_planner which rebuilds the IntList from a Bitmapset: while ((i = bms_next_member(partitioned_relids, i)) >= 0) partitioned_rels = lappend_int(partitioned_rels, i); ExecLockNonLeafAppendTables could be made to accept a Bitmapset rather than a List. In fact, we could probably get rid of the nested loops if we did it that way. What do you think? -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 16 July 2018 at 12:55, David Rowley <david.rowley@2ndquadrant.com> wrote: > Thinking about this some more, I don't quite see any reason that the > partitioned_rels for a single hierarchy couldn't just be a Bitmapset > instead of an IntList. Of course, this is not possible since we can't pass a List of Bitmapsets to the executor due to Bitmapset not being a node type. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
RE: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
Hi,
I should post that in the general section but I am confused by the sentence "A parent partition is always going to have a lower relid than its children"
www.postgresql.org Michael Fuhr <mike(at)fuhr(dot)org> writes: > On Thu, Mar 24, 2005 at 11:01:23PM -0300, Edson Vilhena de Carvalho wrote: >> Can anyone tell me what is a relid, a relname and |
In fact, I want to be sure I can say to the developers they will always be able to create tables and partitions in any order :
create table midparent1(c1 int, c2 int) partition by list(c2);
alter table midparent1 attach partition child1 for values in (1);
create table child2 partition of midparent1 for values in (2);
create table topparent(c1 int, c2 int) partition by list(c1);
alter table topparent attach partition midparent1 for values in (1);
relname | relkind | oid
------------+---------+--------
child1 | r | 123989
midparent1 | p | 123992
child2 | r | 123995
topparent | p | 123998
(4 lignes)
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
I should post that in the general section but I am confused by the sentence "A parent partition is always going to have a lower relid than its children"
RE: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
I get it. Thank you for this precision.
Regards
Phil
Envoyé : lundi 16 juillet 2018 07:48
À : Phil Florent
Cc : Tom Lane; Robert Haas; Amit Langote; PostgreSQL Hackers
Objet : Re: Internal error XX000 with enable_partition_pruning=on, pg 11 beta1 on Debian
I should post that in the general section but I am confused by the sentence "A parent partition is always going to have a lower relid than its children"
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 16 July 2018 at 06:55, Tom Lane <tgl@sss.pgh.pa.us> wrote: > I started to look at this patch. I think this is basically the right > direction to go in, but I'm not terribly happy with the details of the > data structure design. I've made an attempt at addressing the issues that I understood. I've not done anything about your Bitmapset for non-matching partitions. I fail to see how that would improve the code. But please feel free to provide details of your design and I'll review it and let you know what I think about it. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Вложения
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018-Jul-16, David Rowley wrote: > On 16 July 2018 at 12:55, David Rowley <david.rowley@2ndquadrant.com> wrote: > > Thinking about this some more, I don't quite see any reason that the > > partitioned_rels for a single hierarchy couldn't just be a Bitmapset > > instead of an IntList. > > Of course, this is not possible since we can't pass a List of > Bitmapsets to the executor due to Bitmapset not being a node type. Maybe we can just add a new node type that wraps a lone bitmapset. The naive implementation (just print out individual members) should be pretty straightforward; a more sophisticated one (print out the "words") may end up more compact or not depending on density, but much harder for debugging, and probably means a catversion bump when BITS_PER_BITMAPWORD is changed, so probably not a great idea anyway. I suppose the only reason we haven't done this yet is nobody has needed it. Sounds like its time has come. -- Álvaro Herrera https://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 17 July 2018 at 12:21, David Rowley <david.rowley@2ndquadrant.com> wrote: > On 16 July 2018 at 06:55, Tom Lane <tgl@sss.pgh.pa.us> wrote: >> I started to look at this patch. I think this is basically the right >> direction to go in, but I'm not terribly happy with the details of the >> data structure design. > > I've made an attempt at addressing the issues that I understood. I've attached a patch intended for master which is just v2 based on post 5220bb7533. No other changes were made. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Вложения
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 18 July 2018 at 06:01, Alvaro Herrera <alvherre@2ndquadrant.com> wrote: > On 2018-Jul-16, David Rowley wrote: > >> On 16 July 2018 at 12:55, David Rowley <david.rowley@2ndquadrant.com> wrote: >> > Thinking about this some more, I don't quite see any reason that the >> > partitioned_rels for a single hierarchy couldn't just be a Bitmapset >> > instead of an IntList. >> >> Of course, this is not possible since we can't pass a List of >> Bitmapsets to the executor due to Bitmapset not being a node type. > > Maybe we can just add a new node type that wraps a lone bitmapset. The > naive implementation (just print out individual members) should be > pretty straightforward; a more sophisticated one (print out the "words") > may end up more compact or not depending on density, but much harder for > debugging, and probably means a catversion bump when BITS_PER_BITMAPWORD > is changed, so probably not a great idea anyway. > > I suppose the only reason we haven't done this yet is nobody has needed > it. Sounds like its time has come. I don't mind doing the work if that's what's wanted, but I'd rather wait for Tom to provide a bit more input into this as he seems to have some ideas that I don't understand well enough to write code for. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018/07/19 22:03, David Rowley wrote: > v3-0001-Fix-run-time-partition-pruning-for-UNION-ALL-pare.patch Thanks for updating the patch. I studied this patch today and concluded that it's going a bit too far by carrying the nested partition pruning info structures from the planner all the way into the executor. I understood the root cause of this issue as that make_partition_pruneinfo trips when UNION ALL's parent subquery, instead of the actual individual partitioned root tables, is treated as the root parent rel when converting prunequals using appenrel_adjust_*. That happens because of a flattened partitioned_rels list whose members are all assumed by make_partition_pruneinfo to have the same root parent and that it is an actual partitioned table. That assumption fails in this case because the parent is really the UNION ALL subquery rel. I think the fix implemented in the patch by modifying allpaths.c is correct, whereby the partition hierarchies are preserved by having nested Lists of partitioned rels. So, the partitioned_rels List corresponding to UNION ALL subquery parent itself contains Lists corresponding to partitioned tables appearing under it. With that, make_partition_pruneinfo (actually, make_partitionedrel_pruneinfo in the patch) can correctly perform its work for every sub-List, because for each sub-List, it can identify the correct root partitioned table parent to use. But I don't think the result of make_partition_pruneinfo itself has to be List of PartitionedRelPruneInfo nested under PartitionPruneInfo. I gather that each PartitionPruneInfo corresponds to each root partitioned table and a PartitionedRelPruneInfo contains the actual pruning information, which is created for every partitioned table (non-leaf tables), including the root tables. I don't think such nesting is necessary. I think that just like flattened partitioned_rels list, we should put flattened list of PartitionedRelPruneInfo into the Append or MergeAppend plan. No need for nesting PartitionedRelPruneInfo under PartitionPruneInfo. We create a relid_subplan_map from the flattened list of sub-plans, where sub-plans of leaf partitions of different partitioned tables appear in the same list. Similarly, I think, we should create relid_subpart_map from the flattened list of partitioned_rels, where partitioned rel RT indexes coming from different partitioned tables appear in the same list. Currently relid_subpart_map seems to be constructed separately for each sub-List of nested partitioned_rels list, so while subplan_map of each PartitionedRelPruneInfo contains indexes into a global array of leaf partition sub-plans, subpart_map contains indexes into local array of PartitionedRelPruneInfo for that partitioned table. But, I think there is not a big hurdle in making even the latter contain indexes into global array of PartitionedRelPruneInfos of *all* partitioned tables. On the executor side, instead of having PartitionedRelPruningData be nested under PartitionPruningData, which in turn are stored in the top-level PartitionPruneState, store them directly in PartitionPruneState, since we're making planner put global indexes into subpart_map. Slight adjustment seems to be needed to make ExecInitFindMatchingSubPlans and ExecFindMatchingSubPlans skip the PartitionedRelPruningData of non-root tables, because find_matching_subplans_recurse takes care of recursing to non-root ones. Actually, not skipping them seems to cause wrong result. To verify that such an approach would actually work, I modified the relevant parts of your patch and confirmed that it does. See attached a delta patch. Thanks, Amit PS: Other than the main patch, I think it would be nice to add a RT index field to PartitionedRelPruneInfo in addition to the existing Oid field. It would help to identify and fetch the Relation from a hypothetical executor-local array of Relation pointers which is addressable by RT indexes.
Вложения
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 20 July 2018 at 21:44, Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> wrote: > But I don't think the result of make_partition_pruneinfo itself has to be > List of PartitionedRelPruneInfo nested under PartitionPruneInfo. I gather > that each PartitionPruneInfo corresponds to each root partitioned table > and a PartitionedRelPruneInfo contains the actual pruning information, > which is created for every partitioned table (non-leaf tables), including > the root tables. I don't think such nesting is necessary. I think that > just like flattened partitioned_rels list, we should put flattened list of > PartitionedRelPruneInfo into the Append or MergeAppend plan. No need for > nesting PartitionedRelPruneInfo under PartitionPruneInfo. To do that properly you'd need to mark the target partitioned table of each hierarchy. Your test of pg_class.relispartition is not going to work as you're assuming the query is always going to the root. The user might query some intermediate partitioned table (which will have relispartition = true). Your patch will fall flat in that case. You could work around that by having some array that points to the target partitioned table of each hierarchy, but I don't see why that's better than having the additional struct. There's also some code inside ExecFindInitialMatchingSubPlans() which does a backward scan over the partitions. This must process children before their parents. Unsure how well that's going to work if we start mixing the hierarchies. I'm sure it can be made to work providing each hierarchy is stored together consecutively in the array, but it just seems pretty fragile to me. That code is already pretty hard to follow. What's the reason you don't like the additional level to represent multiple hierarchies? -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018/07/21 0:17, David Rowley wrote: > On 20 July 2018 at 21:44, Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> wrote: >> But I don't think the result of make_partition_pruneinfo itself has to be >> List of PartitionedRelPruneInfo nested under PartitionPruneInfo. I gather >> that each PartitionPruneInfo corresponds to each root partitioned table >> and a PartitionedRelPruneInfo contains the actual pruning information, >> which is created for every partitioned table (non-leaf tables), including >> the root tables. I don't think such nesting is necessary. I think that >> just like flattened partitioned_rels list, we should put flattened list of >> PartitionedRelPruneInfo into the Append or MergeAppend plan. No need for >> nesting PartitionedRelPruneInfo under PartitionPruneInfo. > > To do that properly you'd need to mark the target partitioned table of > each hierarchy. Your test of pg_class.relispartition is not going to > work as you're assuming the query is always going to the root. The > user might query some intermediate partitioned table (which will have > relispartition = true). Your patch will fall flat in that case. Yeah, I forgot to consider that. > You could work around that by having some array that points to the > target partitioned table of each hierarchy, but I don't see why that's > better than having the additional struct. Or it could be a Bitmapset called root_indexes that stores the offset of the first Index value in every partitioned_rels list contained in turn in the list that's passed to make_partition_pruneinfo. > There's also some code > inside ExecFindInitialMatchingSubPlans() which does a backward scan > over the partitions. This must process children before their parents. > Unsure how well that's going to work if we start mixing the > hierarchies. I'm sure it can be made to work providing each hierarchy > is stored together consecutively in the array, but it just seems > pretty fragile to me. That code is already pretty hard to follow. I don't see how removing a nested loop changes things for worse. AIUI, the code replaces index values contained in the subplan_map arrays of various PartitionedRelPruningData structs to account for any pruned sub-plans. Removing a nesting level because of having removed the nesting struct doesn't seem to affect anything about that translation. But your point here seems to be about the relative ordering of PartitionedRelPruningData structs among themselves being affected due to their now being put into a flat array, although I don't see that as being any more fragile. We already are assuming a bunch about the relative ordering of sub-plans or of PartitionedRelPruningData structs to have been relying on storing their indexes in subplan_map and subpart_map. Also, it occurred to me that the new subplan indexes that ExecFindInitialMatchingSubPlans computes are based on where subplans are actually stored in the AppendState.appendplans array, which, in turn, is based on the Bitmapset of "valid subplans" that ExecFindInitialMatchingSubPlans passes back to ExecInitAppend. > What's the reason you don't like the additional level to represent > multiple hierarchies? I started thinking about flattening PartitionedRelPruneInfo after looking at flatten_partitioned_rels() in your patch. If we're flattening partitioned_rels (that is, not having it as a List of Lists in the finished plan), why not flatten the pruning info node too? As I said earlier, I get it that we need List of Lists within the planner to get make_partition_pruneinfo to work correctly in these types of queries, but once we have figured out the correct details to pass to executor to perform run-time pruning, I don't see why we need to pass that info again as a List of Lists. I have attached v2 of the delta patch which adds a root_indexes field to PartitionPruneInfo to track topmost parents in every partition hierarchy contained whose pruning info is contained in the Append. Also, I noticed a bug with how ExecFindInitialMatchingSubPlans handles other_subplans. While the indexes in subplan_map arrays are updated to contain revised values after pruning, those in the other_subplans Bitmapset are not, leading to crashes or possibly wrong result. For example: create table p (a int, b int, c int) partition by list (a); create table p1 partition of p for values in (1); create table p2 partition of p for values in (2); create table q (a int, b int, c int) partition by list (a); create table q1 partition of q for values in (1) partition by list (b); create table q11 partition of q1 for values in (1) partition by list (c); create table q111 partition of q11 for values in (1); create table q2 partition of q for values in (2) partition by list (b); create table q21 partition of q2 for values in (1); create table q22 partition of q2 for values in (2); prepare q (int, int) as select * from ( select * from p union all select * from q1 union all select 1, 1, 1 ) s(a, b, c) where s.a = $1 and s.b = $2 and s.c = (select 1); set plan_cache_mode TO force_generic_plan; explain (costs off, analyze) execute q (1, 1); server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. The connection to the server was lost. Attempting reset: Failed. I have attached a fix for that as a delta patch, which results in: explain (costs off, analyze) execute q (1, 1); QUERY PLAN ────────────────────────────────────────────────────────────────── Append (actual time=0.153..0.179 rows=1 loops=1) InitPlan 1 (returns $0) -> Result (actual time=0.023..0.032 rows=1 loops=1) Subplans Removed: 1 -> Seq Scan on p1 (actual time=0.022..0.022 rows=0 loops=1) Filter: ((a = $1) AND (b = $2) AND (c = $0)) -> Seq Scan on q111 (actual time=0.012..0.012 rows=0 loops=1) Filter: ((a = $1) AND (b = $2) AND (c = $0)) -> Result (actual time=0.014..0.022 rows=1 loops=1) One-Time Filter: ((1 = $1) AND (1 = $2) AND (1 = $0)) Planning Time: 8.136 ms Execution Time: 0.562 ms (12 rows) Thanks, Amit
Вложения
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 20 July 2018 at 01:03, David Rowley <david.rowley@2ndquadrant.com> wrote: > I've attached a patch intended for master which is just v2 based on > post 5220bb7533. In [1] I mentioned that I think that bug should be fixed as part of this bug fix too. It just seems a little strange to fix that one separately when without the v3 patch for this fix the code still won't work correctly when any subplans are present which don't belong in the partition hierarchy. I've attached a patch which can be applied on top of the v3 patch. [1] https://www.postgresql.org/message-id/CAKJS1f9e7hHYP9SaT%3D-_RR4jdmm9VCgtDoC3-60s97EMjcfGWg%40mail.gmail.com -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Вложения
[ getting back to this thread at last ] Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> writes: > On 2018/07/21 0:17, David Rowley wrote: >> You could work around that by having some array that points to the >> target partitioned table of each hierarchy, but I don't see why that's >> better than having the additional struct. > Or it could be a Bitmapset called root_indexes that stores the offset of > the first Index value in every partitioned_rels list contained in turn in > the list that's passed to make_partition_pruneinfo. I'm unimpressed with this solution --- that's just another independent data structure that we'd have to keep in sync with the main one. (For instance, if we ever added/removed PartitionedRelPruningData structs in the list, we'd have to renumber that bitmapset's bits.) If we wanted to go that way, it would make much more sense to add an "is_root" boolean field to the PartitionedRelPruningData structs. However, I tend to agree with David that flattening the partitioning struct tree isn't actually a worthy goal to pursue. First, I don't see that it buys us much, and second, I'm afraid we'll just end up undoing it or else adding annotations that are morally equivalent to having the nested structure. > Also, I noticed a bug with how ExecFindInitialMatchingSubPlans handles > other_subplans. While the indexes in subplan_map arrays are updated to > contain revised values after pruning, those in the other_subplans > Bitmapset are not, leading to crashes or possibly wrong result. This is actually a lovely example of why I dislike having a bunch of auxiliary bitmapsets (or lists of ints) dangling off the plan tree. They're maintenance headaches. I would rather fix this problem by not having other_subplans in the first place. Or maybe we should get rid of the subplan-renumbering business: that looks like bugs waiting to happen, IMO, and I'm really unconvinced that it buys us anything that's worth the overhead of doing it. Off to study David's last patch in more detail. regards, tom lane
David Rowley <david.rowley@2ndquadrant.com> writes: > On 20 July 2018 at 01:03, David Rowley <david.rowley@2ndquadrant.com> wrote: >> I've attached a patch intended for master which is just v2 based on >> post 5220bb7533. I've pushed the v3 patch with a lot of editorial work (e.g. cleaning up comments you hadn't). I still want to think about getting rid of some of the "extraneous" bitmapsets and lists that are running around here ... but time grows short before beta3, and it's not clear that that would be appropriate material to push into v11 anyway. > In [1] I mentioned that I think that bug should be fixed as part of > this bug fix too. I didn't include this change because (a) it's late, (b) no test case was included, and (c) I don't entirely believe it anyway. How would a rel be both leaf and nonleaf? Isn't this indicative of a problem somewhere upstream in the planner? regards, tom lane
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2 August 2018 at 11:48, Tom Lane <tgl@sss.pgh.pa.us> wrote: > I've pushed the v3 patch with a lot of editorial work (e.g. cleaning > up comments you hadn't). Thanks for doing that. > >> In [1] I mentioned that I think that bug should be fixed as part of >> this bug fix too. > > I didn't include this change because (a) it's late, (b) no test > case was included, and (c) I don't entirely believe it anyway. > How would a rel be both leaf and nonleaf? Isn't this indicative > of a problem somewhere upstream in the planner? It's probably best discussed on the other thread, but it seems to be by design in accumulate_append_subpath(). Parallel Append nodes don't get flattened if they contain a mix of parallel aware and non-parallel aware subplans. So you might be right, maybe a better option is to have that code reorder the subplans so that the parallel aware ones stay at the end. I'm just not convinced that fixing that code means it would mean it would never happen again. It does not seem too outrageous to me to support nested Appends, and with those, what else would the Path's parent RelOptInfo be if it's not the partitioned table that the node's subpaths belong to? Or maybe, in that case, the partitioned_rels belonging to the sub-Append should not have been included in the List for the top-level Append. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018-Aug-01, Tom Lane wrote: > David Rowley <david.rowley@2ndquadrant.com> writes: > > On 20 July 2018 at 01:03, David Rowley <david.rowley@2ndquadrant.com> wrote: > >> I've attached a patch intended for master which is just v2 based on > >> post 5220bb7533. > > I've pushed the v3 patch with a lot of editorial work (e.g. cleaning > up comments you hadn't). Thanks Tom, much appreciated. -- Álvaro Herrera https://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
CREATE TABLE pt (a INT, b INT, c INT) PARTITION BY RANGE(a);
CREATE TABLE pt_p1 PARTITION OF pt FOR VALUES FROM (1) to (6) PARTITION BY RANGE (b);
CREATE TABLE pt_p1_p1 PARTITION OF pt_p1 FOR VALUES FROM (11) to (44);
CREATE TABLE pt_p1_p2 PARTITION OF pt_p1 FOR VALUES FROM (44) to (66);
INSERT INTO pt (a,b,c) VALUES (1,11,111),(2,22,222),(3,33,333),(4,44,444),(5,55,555);
-- rule on root partition to first level child,
CREATE RULE pt_rule_ptp1 AS ON UPDATE TO pt DO INSTEAD UPDATE pt_p1 SET a = new.a WHERE a = old.a;
ERROR: child rel 1 not found in append_rel_array
commit 1b54e91faabf3764b6786915881e514e42dccf89
Author: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed Aug 1 19:42:46 2018 -0400
Fix run-time partition pruning for appends with multiple source rels.
attribute and end up with error from find_appinfos_by_relids().
* The prunequal is presented to us as a qual for 'parentrel'.
* Frequently this rel is the same as targetpart, so we can skip
* an adjust_appendrel_attrs step. But it might not be, and then
* we have to translate. We update the prunequal parameter here,
* because in later iterations of the loop for child partitions,
* we want to translate from parent to child variables.
*/
if (parentrel != subpart)
{
int nappinfos;
AppendRelInfo **appinfos = find_appinfos_by_relids(root,
subpart->relids,
&nappinfos);
prunequal = (List *) adjust_appendrel_attrs(root, (Node *)
prunequal,
nappinfos,
appinfos);
pfree(appinfos);
}
Regards,
On 2018-Aug-01, Tom Lane wrote:
> David Rowley <david.rowley@2ndquadrant.com> writes:
> > On 20 July 2018 at 01:03, David Rowley <david.rowley@2ndquadrant.com> wrote:
> >> I've attached a patch intended for master which is just v2 based on
> >> post 5220bb7533.
>
> I've pushed the v3 patch with a lot of editorial work (e.g. cleaning
> up comments you hadn't).
Thanks Tom, much appreciated.
--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
--
Rushabh Lathia <rushabh.lathia@gmail.com> writes: > Consider the below case: I initially thought the rule might be messing stuff up, but you can get the same result without the rule by writing out the transformed query by hand: regression=# explain UPDATE pt_p1 SET a = 3 from pt WHERE pt.a = 2 and pt.a = pt_p1.a; ERROR: child rel 2 not found in append_rel_array With enable_partition_pruning=off this goes through without an error. I suspect the join pruning stuff is getting confused by the overlap between the two partitioning trees involved in the join; although the fact that one of them is the target rel must be related too, because if you just write a SELECT for this join it's fine. I rather doubt that this case worked before 1b54e91fa ... no time to look closer today, though. regards, tom lane
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018/08/08 8:09, Tom Lane wrote: > Rushabh Lathia <rushabh.lathia@gmail.com> writes: >> Consider the below case: > > I initially thought the rule might be messing stuff up, but you can get > the same result without the rule by writing out the transformed query > by hand: > > regression=# explain UPDATE pt_p1 SET a = 3 from pt > WHERE pt.a = 2 and pt.a = pt_p1.a; > ERROR: child rel 2 not found in append_rel_array > > With enable_partition_pruning=off this goes through without an error. > > I suspect the join pruning stuff is getting confused by the overlap > between the two partitioning trees involved in the join; although the > fact that one of them is the target rel must be related too, because > if you just write a SELECT for this join it's fine. > > I rather doubt that this case worked before 1b54e91fa ... no time > to look closer today, though. The code pointed out by Rushabh indeed seems to be the culprit in this case. /* * The prunequal is presented to us as a qual for 'parentrel'. * Frequently this rel is the same as targetpart, so we can skip * an adjust_appendrel_attrs step. But it might not be, and then * we have to translate. We update the prunequal parameter here, * because in later iterations of the loop for child partitions, * we want to translate from parent to child variables. */ if (parentrel != subpart) { int nappinfos; AppendRelInfo **appinfos = find_appinfos_by_relids(root, subpart->relids, &nappinfos); prunequal = (List *) adjust_appendrel_attrs(root, (Node *) prunequal, nappinfos, appinfos); pfree(appinfos); } This code is looking for the case where we have to translate prunequal's Vars from UNION ALL parent varno to actual partitioned parent's varno, but the detection test (if (parentrel != subpart)) turns out to be unreliable, as shown by this report. I think the test should be if (parent->relid != subpart->relid). Comparing pointers as is done now is fine without inheritance_planner being involved, but not when it *is* involved, because of the following piece of code in it that overwrites RelOptInfos: /* * We need to collect all the RelOptInfos from all child plans into * the main PlannerInfo, since setrefs.c will need them. We use the * last child's simple_rel_array (previous ones are too short), so we * have to propagate forward the RelOptInfos that were already built * in previous children. */ Assert(subroot->simple_rel_array_size >= save_rel_array_size); for (rti = 1; rti < save_rel_array_size; rti++) { RelOptInfo *brel = save_rel_array[rti]; if (brel) subroot->simple_rel_array[rti] = brel; } save_rel_array_size = subroot->simple_rel_array_size; save_rel_array = subroot->simple_rel_array; save_append_rel_array = subroot->append_rel_array; With this, the RelOptInfos that would've been used to create Paths that are currently under the subroot's final rel's best path, would no longer be accessible through that subroot, because they're overwritten by the corresponding ones in save_rel_array. Subsequently, create_modifytable_plan() passes the 'subroot' to create_plan_recurse() that will recurse down to make_parttionedrel_pruneinfo() via create_append_plan(). The 'parentrel' RelOptInfo that's fetched off of AppendPath is no longer reachable from 'subroot', because it's been overwritten as mentioned above. 'subpart', the RelOptInfo (of the same RT index) fetched from 'subroot' is thus not the same as 'parentrel'. So, the if (parentrel != subpart) test is mistakenly satisfied, leading to the failure of finding the needed AppendRelInfo, which makes sense, because 'subpart' is not really a child of anything. Attached is a patch which modifies the if test to compare relids instead of RelOptInfo pointers. Thanks, Amit
Вложения
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 8 August 2018 at 17:28, Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> wrote: > Attached is a patch which modifies the if test to compare relids instead > of RelOptInfo pointers. Thanks for investigating and writing a patch. I agree with the fix. It's probably worth writing a test that performs run-time pruning from an inheritance planner plan. Do you want to include that in your patch? If not, I can. -- David Rowley http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
David Rowley <david.rowley@2ndquadrant.com> writes: > On 8 August 2018 at 17:28, Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> wrote: >> Attached is a patch which modifies the if test to compare relids instead >> of RelOptInfo pointers. > Thanks for investigating and writing a patch. I agree with the fix. I changed this to compare the relid sets not just rel->relid, since rel->relid is only reliable for baserels. The partitioned rel could safely be assumed to be a baserel, but I'm less comfortable with supposing that the parentrel always will be. Otherwise, added a test case based on Rushabh's example and pushed. (I'm not quite sure if the plan will be stable enough to satisfy the buildfarm, but we'll soon find out ...) regards, tom lane
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018/08/09 0:48, Tom Lane wrote: > David Rowley <david.rowley@2ndquadrant.com> writes: >> On 8 August 2018 at 17:28, Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> wrote: >>> Attached is a patch which modifies the if test to compare relids instead >>> of RelOptInfo pointers. > >> Thanks for investigating and writing a patch. I agree with the fix. > > I changed this to compare the relid sets not just rel->relid, since > rel->relid is only reliable for baserels. The partitioned rel could > safely be assumed to be a baserel, but I'm less comfortable with > supposing that the parentrel always will be. Otherwise, added a > test case based on Rushabh's example and pushed. (I'm not quite > sure if the plan will be stable enough to satisfy the buildfarm, > but we'll soon find out ...) Thank you for committing, agreed that comparing relid sets for equality might be more future-proof. About the test case, wondering if we should, like David seemed to suggest, add a test case that would actually use run-time pruning? Maybe, even better if the new test also had partitioned parent under UNION ALL parent under ModifyTable. Something like in the attached? One reason why we should adapt such a test case is that, in the future, we may arrange for make_partitionedrel_pruneinfo(), whose code we just fixed, to not be called if we know that run-time pruning is not needed. It seems that that's true for the test added by the commit, that is, it doesn't need run-time pruning. Regards, Amit
Вложения
Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> writes: > One reason why we should adapt such a test case is that, in the future, we > may arrange for make_partitionedrel_pruneinfo(), whose code we just fixed, > to not be called if we know that run-time pruning is not needed. It seems > that that's true for the test added by the commit, that is, it doesn't > need run-time pruning. Not following your argument here. Isn't make_partition_pruneinfo precisely what is in charge of figuring out whether run-time pruning is possible? (See my point in the other thread about Jaime's assertion crash, that no run-time pruning actually would be possible for that query. But we got to the assertion failure anyway.) regards, tom lane
Re: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
On 2018/08/09 13:00, Tom Lane wrote: > Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> writes: >> One reason why we should adapt such a test case is that, in the future, we >> may arrange for make_partitionedrel_pruneinfo(), whose code we just fixed, >> to not be called if we know that run-time pruning is not needed. It seems >> that that's true for the test added by the commit, that is, it doesn't >> need run-time pruning. > > Not following your argument here. Isn't make_partition_pruneinfo > precisely what is in charge of figuring out whether run-time pruning > is possible? With the current coding, yes, it is... > (See my point in the other thread about Jaime's assertion crash, > that no run-time pruning actually would be possible for that query. > But we got to the assertion failure anyway.) The first time I'd seen that make_partition_pruneinfo *always* gets called from create_append_plan if rel->baserestrictinfo is non-NIL, I had wondered whether we couldn't avoid doing it for the cases for which we'll end up throwing away all that work anyway. But looking at the code now, it may be a bit hard -- analyze_partkey_exprs(), which determines whether we'll need any execution-time pruning, could not be called any sooner. So, okay, I have to admit that my quoted argument isn't that strong. Thanks, Amit
RE: Internal error XX000 with enable_partition_pruning=on, pg 11beta1 on Debian
Envoyé : jeudi 9 août 2018 06:35
À : Tom Lane
Cc : David Rowley; Rushabh Lathia; Alvaro Herrera; Robert Haas; Phil Florent; PostgreSQL Hackers
Objet : Re: Internal error XX000 with enable_partition_pruning=on, pg 11 beta1 on Debian
> Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> writes:
>> One reason why we should adapt such a test case is that, in the future, we
>> may arrange for make_partitionedrel_pruneinfo(), whose code we just fixed,
>> to not be called if we know that run-time pruning is not needed. It seems
>> that that's true for the test added by the commit, that is, it doesn't
>> need run-time pruning.
>
> Not following your argument here. Isn't make_partition_pruneinfo
> precisely what is in charge of figuring out whether run-time pruning
> is possible?
With the current coding, yes, it is...
> (See my point in the other thread about Jaime's assertion crash,
> that no run-time pruning actually would be possible for that query.
> But we got to the assertion failure anyway.)
The first time I'd seen that make_partition_pruneinfo *always* gets called
from create_append_plan if rel->baserestrictinfo is non-NIL, I had
wondered whether we couldn't avoid doing it for the cases for which we'll
end up throwing away all that work anyway. But looking at the code now,
it may be a bit hard -- analyze_partkey_exprs(), which determines whether
we'll need any execution-time pruning, could not be called any sooner.
So, okay, I have to admit that my quoted argument isn't that strong.
Thanks,
Amit