// Copyright 2015 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package sql

import (
	"context"
	"fmt"
	"math"
	"math/rand"
	"strings"
	"time"

	"github.com/pkg/errors"

	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/internal/client"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/log/logtags"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
)

// schemaChangeLeaseDuration is the cluster setting controlling how long a
// schema change lease is valid for. createSchemaChangeLease adds it to the
// current time to compute a lease's expiration. Defaults to five minutes.
var schemaChangeLeaseDuration = settings.RegisterNonNegativeDurationSetting(
	"schemachanger.lease.duration",
	"the duration of a schema change lease",
	time.Minute*5,
)

// schemaChangeLeaseRenewFraction is the cluster setting that, multiplied by
// the schema change lease duration, gives the renewal threshold used by
// ExtendLease: a lease is renewed once less than that much time remains
// before it expires. Defaults to 0.4.
//
// NOTE(review): the description below spells the duration setting as
// "schemachanger.lease_duration" while it is registered as
// "schemachanger.lease.duration" -- confirm whether the text should change.
var schemaChangeLeaseRenewFraction = settings.RegisterFloatSetting(
	"schemachanger.lease.renew_fraction",
	"the fraction of schemachanger.lease_duration remaining to trigger a renew of the lease",
	0.4,
)

// asyncSchemaChangeDelay is the base of a delay in the range
// [0.9 * asyncSchemaChangeDelay, 1.1 * asyncSchemaChangeDelay) that is
// added to an attempt to run a schema change via the asynchronous path.
// This delay allows the synchronous path to execute the schema change
// in all likelihood. We'd like the synchronous path to execute
// the schema change so that it doesn't have to poll and wait for
// another node to execute the schema change. Polling can add a polling
// delay to the normal execution of a schema change. This interval is also
// used to reattempt execution of a schema change. We don't want this to
// be too low because once a node has started executing a schema change
// the other nodes should not cause a storm by rapidly trying to grab the
// schema change lease.
//
// TODO(mjibson): Refine the job coordinator to elect a new job coordinator
// on coordinator failure without causing a storm of polling requests
// attempting to become the job coordinator.
const asyncSchemaChangeDelay = 1 * time.Minute

// SchemaChanger is used to change the schema on a table. It holds the
// identity of the table and mutation being processed together with the
// handles (KV database, lease manager, job registry, ...) used to carry the
// change out.
type SchemaChanger struct {
	// tableID and mutationID identify the table and the set of mutations this
	// schema changer works on. nodeID is recorded in the schema change lease
	// and in event log entries. db runs the KV transactions and leaseMgr is
	// used to publish new descriptor versions and wait for them to propagate.
	tableID    sqlbase.ID
	mutationID sqlbase.MutationID
	nodeID     roachpb.NodeID
	db         *client.DB
	leaseMgr   *LeaseManager
	// The SchemaChangeManager can attempt to execute this schema
	// changer after this time.
	execAfter time.Time

	// table.DropTime.
	// NOTE(review): presumably a copy of the descriptor's DropTime in
	// nanoseconds; it is not read in the visible part of this file -- confirm.
	dropTime int64

	// testingKnobs is dereferenced without a nil check by drainNames and
	// runStateMachineAndBackfill, so it must be non-nil on those paths.
	testingKnobs   *SchemaChangerTestingKnobs
	distSQLPlanner *DistSQLPlanner
	jobRegistry    *jobs.Registry
	// Keep a reference to the job related to this schema change
	// so that we don't need to read the job again while updating
	// the status of the job. This job can be one of two jobs: the
	// original schema change job for the sql command, or the
	// rollback job for the rollback of the schema change.
	job *jobs.Job
	// Caches updated by DistSQL.
	rangeDescriptorCache *kv.RangeDescriptorCache
	leaseHolderCache     *kv.LeaseHolderCache
	clock                *hlc.Clock
	settings             *cluster.Settings
	execCfg              *ExecutorConfig
}

// NewSchemaChangerForTesting builds a SchemaChanger from the supplied
// dependencies. It is intended for use by tests only.
//
// The returned changer stores the address of its own copy of db. Its testing
// knobs are initialized to an empty (non-nil) set of knobs so that the code
// paths that consult sc.testingKnobs without a nil check (drainNames,
// runStateMachineAndBackfill) do not panic when driven from a test.
func NewSchemaChangerForTesting(
	tableID sqlbase.ID,
	mutationID sqlbase.MutationID,
	nodeID roachpb.NodeID,
	db client.DB,
	leaseMgr *LeaseManager,
	jobRegistry *jobs.Registry,
	execCfg *ExecutorConfig,
	settings *cluster.Settings,
) SchemaChanger {
	return SchemaChanger{
		tableID:      tableID,
		mutationID:   mutationID,
		nodeID:       nodeID,
		db:           &db,
		leaseMgr:     leaseMgr,
		jobRegistry:  jobRegistry,
		settings:     settings,
		execCfg:      execCfg,
		testingKnobs: &SchemaChangerTestingKnobs{},
	}
}

// createSchemaChangeLease returns a fresh lease owned by this node that
// expires one schemaChangeLeaseDuration from now.
func (sc *SchemaChanger) createSchemaChangeLease() sqlbase.TableDescriptor_SchemaChangeLease {
	now := timeutil.Now()
	validFor := schemaChangeLeaseDuration.Get(&sc.settings.SV)
	expiresAt := now.Add(validFor)
	return sqlbase.TableDescriptor_SchemaChangeLease{
		NodeID:         sc.nodeID,
		ExpirationTime: expiresAt.UnixNano(),
	}
}

// isPermanentSchemaChangeError reports whether err must be treated as a
// permanent failure of a schema change. The function works as a whitelist:
// only the errors recognized below are considered transient, and anything
// it does not recognize is assumed to be permanent. A nil error is not
// permanent.
func isPermanentSchemaChangeError(err error) bool {
	if err == nil {
		return false
	}
	cause := errors.Cause(err)

	switch {
	case grpcutil.IsClosedConnection(cause):
		return false
	case strings.Contains(cause.Error(), "must be after GC threshold"):
		// The read happened at a very old timestamp. The backfill will pick
		// a new timestamp to read at for the remainder of its work.
		return false
	}

	// Sentinel errors that are known to be transient.
	for _, transient := range []error{
		context.Canceled,
		context.DeadlineExceeded,
		errExistingSchemaChangeLease,
		errExpiredSchemaChangeLease,
		errNotHitGCTTLDeadline,
		errSchemaChangeDuringDrain,
		errSchemaChangeNotFirstInLine,
	} {
		if cause == transient {
			return false
		}
	}

	// Error types that are known to be transient.
	if _, isMismatch := cause.(errTableVersionMismatch); isMismatch {
		return false
	}
	if pgErr, isPgErr := cause.(*pgerror.Error); isPgErr {
		switch pgErr.Code {
		case pgerror.CodeSerializationFailureError, pgerror.CodeConnectionFailureError:
			return false
		case pgerror.CodeInternalError:
			// An internal error is transient only when it carries the
			// deadline-exceeded message.
			return pgErr.Message != context.DeadlineExceeded.Error()
		}
	}

	return true
}

// Sentinel errors returned while acquiring schema change leases and while
// executing schema changes. isPermanentSchemaChangeError treats every one of
// them as a non-permanent error.
var (
	errExistingSchemaChangeLease  = errors.New("an outstanding schema change lease exists")
	errExpiredSchemaChangeLease   = errors.New("the schema change lease has expired")
	errSchemaChangeNotFirstInLine = errors.New("schema change not first in line")
	errNotHitGCTTLDeadline        = errors.New("not hit gc ttl deadline")
	errSchemaChangeDuringDrain    = errors.New("a schema change ran during the drain phase, re-increment")
)

// shouldLogSchemaChangeError reports whether err is worth logging. The
// sentinel errors for an existing lease, for not being first in line and for
// an unexpired GC TTL deadline are not logged; every other value is.
func shouldLogSchemaChangeError(err error) bool {
	switch err {
	case errExistingSchemaChangeLease,
		errSchemaChangeNotFirstInLine,
		errNotHitGCTTLDeadline:
		return false
	default:
		return true
	}
}

// errTableVersionMismatch is the error describing a table descriptor whose
// version differs from the version that was expected. It is treated as a
// non-permanent error by isPermanentSchemaChangeError.
type errTableVersionMismatch struct {
	// version is the version that was found.
	version sqlbase.DescriptorVersion
	// expected is the version that was expected.
	expected sqlbase.DescriptorVersion
}

// makeErrTableVersionMismatch returns an errTableVersionMismatch for the
// given versions, annotated with a stack trace.
func makeErrTableVersionMismatch(version, expected sqlbase.DescriptorVersion) error {
	mismatch := errTableVersionMismatch{version: version, expected: expected}
	return errors.WithStack(mismatch)
}

// Error implements the error interface.
func (e errTableVersionMismatch) Error() string {
	const format = "table version mismatch: %d, expected: %d"
	return fmt.Sprintf(format, e.version, e.expected)
}

// AcquireLease writes a new schema change lease into the table descriptor
// unless the descriptor already carries a lease that has not yet expired, in
// which case errExistingSchemaChangeLease is returned. On success the newly
// written lease is returned.
func (sc *SchemaChanger) AcquireLease(
	ctx context.Context,
) (sqlbase.TableDescriptor_SchemaChangeLease, error) {
	// One second of slack to deal with the time uncertainty across nodes.
	// It is perfectly valid for two or more goroutines to hold a valid
	// lease and execute a schema change in parallel, because schema
	// changes are executed using transactions that run sequentially.
	// The slack merely reduces the probability of a write collision.
	const expirationTimeUncertainty = time.Second

	var acquired sqlbase.TableDescriptor_SchemaChangeLease
	acquire := func(ctx context.Context, txn *client.Txn) error {
		if err := txn.SetSystemConfigTrigger(); err != nil {
			return err
		}
		desc, err := sqlbase.GetTableDescFromID(ctx, txn, sc.tableID)
		if err != nil {
			return err
		}

		if current := desc.Lease; current != nil {
			validUntil := timeutil.Unix(0, current.ExpirationTime).Add(expirationTimeUncertainty)
			if validUntil.After(timeutil.Now()) {
				return errExistingSchemaChangeLease
			}
			log.Infof(ctx, "Overriding existing expired lease %v", current)
		}

		acquired = sc.createSchemaChangeLease()
		desc.Lease = &acquired
		descKey := sqlbase.MakeDescMetadataKey(desc.ID)
		return txn.Put(ctx, descKey, sqlbase.WrapDescriptor(desc))
	}

	err := sc.db.Txn(ctx, acquire)
	return acquired, err
}

// findTableWithLease reads the table descriptor within txn and returns it
// only if the lease stored in the descriptor equals the supplied lease. It
// returns an error when the descriptor has no lease at all, and
// errExpiredSchemaChangeLease when the stored lease differs.
func (sc *SchemaChanger) findTableWithLease(
	ctx context.Context, txn *client.Txn, lease sqlbase.TableDescriptor_SchemaChangeLease,
) (*sqlbase.TableDescriptor, error) {
	desc, err := sqlbase.GetTableDescFromID(ctx, txn, sc.tableID)
	if err != nil {
		return nil, err
	}

	switch {
	case desc.Lease == nil:
		return nil, errors.Errorf("no lease present for tableID: %d", sc.tableID)
	case *desc.Lease != lease:
		log.Errorf(ctx, "table: %d has lease: %v, expected: %v", sc.tableID, desc.Lease, lease)
		return nil, errExpiredSchemaChangeLease
	}
	return desc, nil
}

// ReleaseLease clears the lease on the table descriptor, provided that the
// lease currently registered with the descriptor is the one passed in.
func (sc *SchemaChanger) ReleaseLease(
	ctx context.Context, lease sqlbase.TableDescriptor_SchemaChangeLease,
) error {
	release := func(ctx context.Context, txn *client.Txn) error {
		desc, err := sc.findTableWithLease(ctx, txn, lease)
		if err != nil {
			return err
		}
		desc.Lease = nil
		if err := txn.SetSystemConfigTrigger(); err != nil {
			return err
		}
		descKey := sqlbase.MakeDescMetadataKey(desc.ID)
		return txn.Put(ctx, descKey, sqlbase.WrapDescriptor(desc))
	}
	return sc.db.Txn(ctx, release)
}

// ExtendLease renews the lease held by the current leaser when it is close
// to expiring, and is a no-op otherwise. It should be called often during a
// schema change to prevent more than one node attempting to apply the same
// schema change (which is still safe, but unwise). When the lease is
// renewed, *existingLease is overwritten with the new lease.
func (sc *SchemaChanger) ExtendLease(
	ctx context.Context, existingLease *sqlbase.TableDescriptor_SchemaChangeLease,
) error {
	// Nothing to do while more than the renewal threshold remains.
	leaseDuration := schemaChangeLeaseDuration.Get(&sc.settings.SV)
	renewFraction := schemaChangeLeaseRenewFraction.Get(&sc.settings.SV)
	renewThreshold := time.Duration(float64(leaseDuration) * renewFraction)
	expiresAt := timeutil.Unix(0, existingLease.ExpirationTime)
	if expiresAt.After(timeutil.Now().Add(renewThreshold)) {
		return nil
	}

	// Replace the lease stored in the descriptor with a fresh one.
	var renewed sqlbase.TableDescriptor_SchemaChangeLease
	renew := func(ctx context.Context, txn *client.Txn) error {
		desc, err := sc.findTableWithLease(ctx, txn, *existingLease)
		if err != nil {
			return err
		}

		renewed = sc.createSchemaChangeLease()
		desc.Lease = &renewed
		if err := txn.SetSystemConfigTrigger(); err != nil {
			return err
		}
		descKey := sqlbase.MakeDescMetadataKey(desc.ID)
		return txn.Put(ctx, descKey, sqlbase.WrapDescriptor(desc))
	}
	err := sc.db.Txn(ctx, renew)
	if err != nil {
		return err
	}

	*existingLease = renewed
	return nil
}

// DropTableDesc removes a table descriptor from the KV database together
// with the zone config entries stored under the table's zone key prefix.
// Both deletions are issued in one batch within a single transaction. When
// traceKV is set, the deletions are also recorded as verbose events.
func DropTableDesc(
	ctx context.Context, tableDesc *sqlbase.TableDescriptor, db *client.DB, traceKV bool,
) error {
	descKey := sqlbase.MakeDescMetadataKey(tableDesc.ID)
	zoneKeyPrefix := config.MakeZoneKeyPrefix(uint32(tableDesc.ID))

	// The table data is expected to be gone by now; remove the metadata.
	dropMetadata := func(ctx context.Context, txn *client.Txn) error {
		batch := &client.Batch{}

		// The descriptor itself.
		if traceKV {
			log.VEventf(ctx, 2, "Del %s", descKey)
		}
		batch.Del(descKey)

		// The zone config entry for this table.
		if traceKV {
			log.VEventf(ctx, 2, "DelRange %s", zoneKeyPrefix)
		}
		batch.DelRange(zoneKeyPrefix, zoneKeyPrefix.PrefixEnd(), false /* returnKeys */)

		if err := txn.SetSystemConfigTrigger(); err != nil {
			return err
		}
		return txn.Run(ctx, batch)
	}
	return db.Txn(ctx, dropMetadata)
}

// truncateTable deletes all of the data in the specified table.
//
// Tables without a DropTime are handed to truncateTableInChunks. For all
// other tables the table's key span is cleared with ClearRange requests that
// each cover up to batchSize ranges, with a pause of waitTime after every
// request. The schema change lease is extended (when needed) while iterating.
//
// The pause honors ctx: if ctx is canceled while waiting, truncateTable
// returns ctx.Err() immediately instead of sleeping out the interval.
//
// evalCtx is currently unused.
func (sc *SchemaChanger) truncateTable(
	ctx context.Context,
	lease *sqlbase.TableDescriptor_SchemaChangeLease,
	table *sqlbase.TableDescriptor,
	evalCtx *extendedEvalContext,
) error {
	// If DropTime isn't set, assume this drop request is from a version
	// 1.1 server and invoke legacy code that uses DeleteRange and range GC.
	if table.DropTime == 0 {
		return truncateTableInChunks(ctx, table, sc.db, false /* traceKV */)
	}

	tableKey := roachpb.RKey(keys.MakeTablePrefix(uint32(table.ID)))
	tableSpan := roachpb.RSpan{Key: tableKey, EndKey: tableKey.PrefixEnd()}

	// ClearRange requests lay down RocksDB range deletion tombstones that have
	// serious performance implications (#24029). The logic below attempts to
	// bound the number of tombstones in one store by sending the ClearRange
	// requests to each range in the table in small, sequential batches rather
	// than letting DistSender send them all in parallel, to hopefully give the
	// compaction queue time to compact the range tombstones away in between
	// requests.
	//
	// As written, this approach has several deficiencies. It does not actually
	// wait for the compaction queue to compact the tombstones away before
	// sending the next request. It is likely insufficient if multiple DROP
	// TABLEs are in flight at once. It does not save its progress in case the
	// coordinator goes down. These deficiencies could be addressed, but this
	// code was originally a stopgap to avoid the range tombstone performance
	// hit. The RocksDB range tombstone implementation has since been improved
	// and the performance implications of many range tombstones have been
	// reduced dramatically, making this simplistic throttling sufficient.

	// These numbers were chosen empirically for the clearrange roachtest and
	// could certainly use more tuning.
	const batchSize = 100
	const waitTime = 500 * time.Millisecond

	// n counts the ranges accumulated since the last ClearRange request and
	// lastKey is the start key of the next request.
	var n int
	lastKey := tableSpan.Key
	ri := kv.NewRangeIterator(sc.execCfg.DistSender)
	for ri.Seek(ctx, tableSpan.Key, kv.Ascending); ; ri.Next(ctx) {
		if !ri.Valid() {
			return ri.Error().GoError()
		}

		// This call is a no-op unless the lease is nearly expired.
		if err := sc.ExtendLease(ctx, lease); err != nil {
			return err
		}

		// The iterator does not move until the next loop iteration, so the
		// answer is computed once and reused below.
		needAnother := ri.NeedAnother(tableSpan)

		// Flush when the batch is full or when this is the last range.
		if n++; n >= batchSize || !needAnother {
			endKey := ri.Desc().EndKey
			if tableSpan.EndKey.Less(endKey) {
				// Never clear past the end of the table.
				endKey = tableSpan.EndKey
			}
			var b client.Batch
			b.AddRawRequest(&roachpb.ClearRangeRequest{
				RequestHeader: roachpb.RequestHeader{
					Key:    lastKey.AsRawKey(),
					EndKey: endKey.AsRawKey(),
				},
			})
			log.VEventf(ctx, 2, "ClearRange %s - %s", lastKey, endKey)
			if err := sc.db.Run(ctx, &b); err != nil {
				return err
			}
			n = 0
			lastKey = endKey

			// Throttle, but give up right away if the context is canceled
			// rather than blocking for the whole interval.
			throttle := time.NewTimer(waitTime)
			select {
			case <-ctx.Done():
				throttle.Stop()
				return ctx.Err()
			case <-throttle.C:
			}
		}

		if !needAnother {
			break
		}
	}

	return nil
}

// maybeAddDrop finishes adding or dropping a table, depending on the state
// of its descriptor. The returned boolean is true only when the table was
// deleted.
//
// For a dropped table the lease is extended first; nothing further happens
// when called in-session. Otherwise, if the table carries a DropTime, the
// table is only deleted once DropTime plus the zone's GC TTL has passed
// (errNotHitGCTTLDeadline is returned before that). For a table being added,
// the descriptor is published as PUBLIC after waiting for the leases on all
// tables referenced by its foreign keys to be updated.
func (sc *SchemaChanger) maybeAddDrop(
	ctx context.Context,
	inSession bool,
	lease *sqlbase.TableDescriptor_SchemaChangeLease,
	table *sqlbase.TableDescriptor,
	evalCtx *extendedEvalContext,
) (bool, error) {
	switch {
	case table.Dropped():
		if err := sc.ExtendLease(ctx, lease); err != nil {
			return false, err
		}
		if inSession {
			return false, nil
		}

		// A change other than the drop may have originally scheduled the
		// changer for this table. Even then the GC deadline has to expire
		// before the data can be removed.
		if table.DropTime != 0 {
			// Time elapsed since the GC deadline; negative while the deadline
			// still lies in the future.
			var sinceDeadline time.Duration
			readDeadline := func(ctx context.Context, txn *client.Txn) error {
				sinceDeadline = 0
				_, zoneCfg, _, err := GetZoneConfigInTxn(
					ctx, txn, uint32(table.ID), &sqlbase.IndexDescriptor{}, "", false,
				)
				if err != nil {
					return err
				}
				ttlNanos := int64(zoneCfg.GC.TTLSeconds) * time.Second.Nanoseconds()
				sinceDeadline = timeutil.Since(timeutil.Unix(0, table.DropTime+ttlNanos))
				return nil
			}
			if err := sc.db.Txn(ctx, readDeadline); err != nil {
				return false, err
			}
			if sinceDeadline < 0 {
				return false, errNotHitGCTTLDeadline
			}
		}

		// Do all the hard work of deleting the table data and the table ID.
		if err := sc.truncateTable(ctx, lease, table, evalCtx); err != nil {
			return false, err
		}
		return true, DropTableDesc(ctx, table, sc.db, false /* traceKV */)

	case table.Adding():
		for _, idx := range table.AllNonDropIndexes() {
			if !idx.ForeignKey.IsSet() {
				continue
			}
			if err := sc.waitToUpdateLeases(ctx, idx.ForeignKey.Table); err != nil {
				return false, err
			}
		}

		makePublic := func(tbl *sqlbase.TableDescriptor) error {
			tbl.State = sqlbase.TableDescriptor_PUBLIC
			return nil
		}
		noop := func(txn *client.Txn) error { return nil }
		if _, err := sc.leaseMgr.Publish(ctx, table.ID, makePublic, noop); err != nil {
			return false, err
		}
	}

	return false, nil
}

// drainNames drains the old names of the table from the cluster: after
// extending the lease, it publishes a descriptor version without any
// draining names and deletes the corresponding name entries in the same
// transaction, after which the names can be reused.
func (sc *SchemaChanger) drainNames(
	ctx context.Context, lease *sqlbase.TableDescriptor_SchemaChangeLease,
) error {
	if err := sc.ExtendLease(ctx, lease); err != nil {
		return err
	}

	// Publish a new version with all the names drained after everyone
	// has seen the version with the new name. All the draining names
	// can be reused henceforth.
	var reclaimable []sqlbase.TableDescriptor_NameInfo

	clearDrainingNames := func(desc *sqlbase.TableDescriptor) error {
		if notify := sc.testingKnobs.OldNamesDrainedNotification; notify != nil {
			notify()
		}
		// Make sure no other schema change ran during the drain phase, so
		// that old names are never reclaimed without having explicitly gone
		// through a drain phase. If the version needs to be incremented,
		// return an error so that sc.exec() is reexecuted: it increments the
		// version and then comes back here to drain the names correctly.
		if desc.UpVersion {
			return errSchemaChangeDuringDrain
		}
		// Free up the old name(s) for reuse.
		reclaimable, desc.DrainingNames = desc.DrainingNames, nil
		return nil
	}

	// Reclaim all the old names.
	deleteNameEntries := func(txn *client.Txn) error {
		batch := txn.NewBatch()
		for _, old := range reclaimable {
			batch.Del(tableKey{old.ParentID, old.Name}.Key())
		}
		return txn.Run(ctx, batch)
	}

	_, err := sc.leaseMgr.Publish(ctx, sc.tableID, clearDrainingNames, deleteNameEntries)
	return err
}

// exec executes the entire schema change in steps: it acquires the schema
// change lease, checks that this mutation is first in line, increments the
// descriptor version if needed, drains old names, completes a pending table
// add/drop, and finally runs the mutation state machine and backfill for
// sc.mutationID. If that last step fails with a permanent error the schema
// change is rolled back.
//
// inSession is set to false when this is called from the asynchronous
// schema change execution path.
//
// If the txn that queued the schema changer did not commit, this will be a
// no-op, as we'll fail to find the job for our mutation in the jobs registry.
func (sc *SchemaChanger) exec(
	ctx context.Context, inSession bool, evalCtx *extendedEvalContext,
) error {
	ctx = logtags.AddTag(ctx, "scExec", nil)
	if log.V(2) {
		log.Infof(ctx, "exec pending schema change; table: %d, mutation: %d",
			sc.tableID, sc.mutationID)
	}
	// Acquire lease.
	lease, err := sc.AcquireLease(ctx)
	if err != nil {
		return err
	}
	needRelease := true
	// Always try to release lease. The closure reads the lease variable when
	// it runs, so it sees any renewal written through &lease below.
	defer func() {
		// If the schema changer deleted the descriptor, there's no longer a lease to be
		// released.
		if !needRelease {
			return
		}
		// A failure to release is only logged; the lease will expire on its own.
		if err := sc.ReleaseLease(ctx, lease); err != nil {
			log.Warning(ctx, err)
		}
	}()

	// Only the first queued set of mutations may run.
	notFirst, err := sc.notFirstInLine(ctx)
	if err != nil {
		return err
	}
	if notFirst {
		return errSchemaChangeNotFirstInLine
	}

	// Increment the version and unset tableDescriptor.UpVersion.
	desc, err := sc.MaybeIncrementVersion(ctx)
	if err != nil {
		return err
	}

	tableDesc := desc.GetTable()
	if tableDesc.HasDrainingNames() {
		if err := sc.drainNames(ctx, &lease); err != nil {
			return err
		}
	}

	// Complete a pending table add or drop. If the table was deleted, the
	// descriptor (and with it the lease) is gone and there is nothing left
	// to do.
	if drop, err := sc.maybeAddDrop(ctx, inSession, &lease, tableDesc, evalCtx); err != nil {
		return err
	} else if drop {
		needRelease = false
		return nil
	}

	// Wait for the schema change to propagate to all nodes after this function
	// returns, so that the new schema is live everywhere. This is not needed for
	// correctness but is done to make the UI experience/tests predictable.
	// Being deferred later, this runs before the lease release above.
	defer func() {
		if err := sc.waitToUpdateLeases(ctx, sc.tableID); err != nil {
			log.Warning(ctx, err)
		}
	}()

	if sc.mutationID == sqlbase.InvalidMutationID {
		// Nothing more to do.
		return nil
	}

	// Find our job.
	foundJobID := false
	for _, g := range tableDesc.MutationJobs {
		if g.MutationID == sc.mutationID {
			job, err := sc.jobRegistry.LoadJob(ctx, g.JobID)
			if err != nil {
				return err
			}
			sc.job = job
			foundJobID = true
			break
		}
	}
	if !foundJobID {
		// No job means we've already run and completed this schema change
		// successfully, so we can just exit.
		return nil
	}

	// Failing to mark the job as started does not stop the schema change; it
	// is only reported at verbosity level 2.
	if err := sc.job.Started(ctx); err != nil {
		if log.V(2) {
			log.Infof(ctx, "Failed to mark job %d as started: %v", *sc.job.ID(), err)
		}
	}

	// Another transaction might set the up_version bit again,
	// but we're no longer responsible for taking care of that.

	// Run through mutation state machine and backfill.
	err = sc.runStateMachineAndBackfill(ctx, &lease, evalCtx)

	// Purge the mutations if the application of the mutations failed due to
	// a permanent error. All other errors are transient errors that are
	// resolved by retrying the backfill.
	if isPermanentSchemaChangeError(err) {
		// If the rollback itself fails, its error is returned instead of the
		// original one.
		if err := sc.rollbackSchemaChange(ctx, err, &lease, evalCtx); err != nil {
			return err
		}
	}

	// The original error (nil on success) is returned even after a
	// successful rollback.
	return err
}

// rollbackSchemaChange reverses the mutations of the schema change that
// failed with err and then runs the reversed schema change. It returns an
// error only when reversing the mutations fails; a failure while running the
// reversed schema change is logged and otherwise ignored.
func (sc *SchemaChanger) rollbackSchemaChange(
	ctx context.Context,
	err error,
	lease *sqlbase.TableDescriptor_SchemaChangeLease,
	evalCtx *extendedEvalContext,
) error {
	log.Warningf(ctx, "reversing schema change %d due to irrecoverable error: %s", *sc.job.ID(), err)

	reverseErr := sc.reverseMutations(ctx, err)
	if reverseErr != nil {
		// The backfill hit an integrity constraint violation and the
		// decision was made to reverse the mutations, but reverseMutations()
		// itself failed. Calling exec() again retries the entire schema
		// change.
		return reverseErr
	}

	// From here on the schema change has been reversed, and any retry of
	// the schema change acts upon the reversed schema change.
	purgeErr := sc.runStateMachineAndBackfill(ctx, lease, evalCtx)
	if purgeErr != nil {
		// This error is deliberately not returned: the caller should learn
		// that an integrity constraint was violated by the original schema
		// change. The reversed schema change will be retried via the async
		// schema change manager.
		log.Warningf(ctx, "error purging mutation: %s, after error: %s", purgeErr, err)
	}
	return nil
}

// MaybeIncrementVersion increments the descriptor version if the descriptor
// has its UpVersion bit set, clearing the bit in the process.
// If the version is to be incremented, it also assures that all nodes are on
// the current (pre-increment) version of the descriptor.
// Returns the (potentially updated) descriptor.
func (sc *SchemaChanger) MaybeIncrementVersion(ctx context.Context) (*sqlbase.Descriptor, error) {
	clearUpVersion := func(tbl *sqlbase.TableDescriptor) error {
		if tbl.UpVersion {
			// Clearing the bit lets Publish() increment the version.
			tbl.UpVersion = false
			return nil
		}
		// Signal Publish() that the version must not be incremented.
		return errDidntUpdateDescriptor
	}
	return sc.leaseMgr.Publish(ctx, sc.tableID, clearUpVersion, nil)
}

// RunStateMachineBeforeBackfill advances the state machine of the leading
// mutations that carry sc.mutationID to the state required by the backfill,
// and then waits until all nodes are seeing the latest version of the table.
// Mutations being added move from DELETE_ONLY to DELETE_AND_WRITE_ONLY;
// mutations being dropped move from DELETE_AND_WRITE_ONLY to DELETE_ONLY.
func (sc *SchemaChanger) RunStateMachineBeforeBackfill(ctx context.Context) error {
	advance := func(desc *sqlbase.TableDescriptor) error {
		changed := false
		// Apply mutations belonging to the same version.
		for i, mutation := range desc.Mutations {
			if mutation.MutationID != sc.mutationID {
				// Mutations are applied in a FIFO order. Only apply the first set of
				// mutations if they have the mutation ID we're looking for.
				break
			}
			// Every other direction/state combination has either moved
			// forward already or needs no change.
			switch {
			case mutation.Direction == sqlbase.DescriptorMutation_ADD &&
				mutation.State == sqlbase.DescriptorMutation_DELETE_ONLY:
				// TODO(vivek): while moving up the state is appropriate,
				// it will be better to run the backfill of a unique index
				// twice: once in the DELETE_ONLY state to confirm that
				// the index can indeed be created, and subsequently in the
				// DELETE_AND_WRITE_ONLY state to fill in the missing elements of the
				// index (INSERT and UPDATE that happened in the interim).
				desc.Mutations[i].State = sqlbase.DescriptorMutation_DELETE_AND_WRITE_ONLY
				changed = true

			case mutation.Direction == sqlbase.DescriptorMutation_DROP &&
				mutation.State == sqlbase.DescriptorMutation_DELETE_AND_WRITE_ONLY:
				desc.Mutations[i].State = sqlbase.DescriptorMutation_DELETE_ONLY
				changed = true
			}
		}
		if changed {
			return nil
		}
		// Signal Publish() that the version must not be incremented.
		return errDidntUpdateDescriptor
	}

	if _, err := sc.leaseMgr.Publish(ctx, sc.tableID, advance, nil); err != nil {
		return err
	}
	// Wait for the state change to propagate to all leases.
	return sc.waitToUpdateLeases(ctx, sc.tableID)
}

// waitToUpdateLeases blocks until the entire cluster has been updated to the
// latest version of the descriptor of the given table.
func (sc *SchemaChanger) waitToUpdateLeases(ctx context.Context, tableID sqlbase.ID) error {
	if log.V(2) {
		log.Infof(ctx, "waiting for a single version of table %d...", tableID)
	}

	// Retry aggressively: a user might be waiting for the schema change to
	// complete.
	_, err := sc.leaseMgr.WaitForOneVersion(ctx, tableID, retry.Options{
		InitialBackoff: 20 * time.Millisecond,
		MaxBackoff:     200 * time.Millisecond,
		Multiplier:     2,
	})

	if log.V(2) {
		log.Infof(ctx, "waiting for a single version of table %d... done", tableID)
	}
	return err
}

// done finalizes the mutations (adds new cols/indexes to the table).
// It ensures that all nodes are on the current (pre-update) version of the
// schema.
//
// The leading mutations carrying sc.mutationID are marked complete and
// trimmed from the descriptor, along with the matching MutationJobs entry.
// In the same transaction the job is marked as succeeded and a "finish
// schema change" (or "finish schema change rollback") event is logged.
//
// Returns the updated version of the descriptor.
func (sc *SchemaChanger) done(ctx context.Context) (*sqlbase.Descriptor, error) {
	isRollback := false
	return sc.leaseMgr.Publish(ctx, sc.tableID, func(desc *sqlbase.TableDescriptor) error {
		// Start every invocation from a clean slate, as the other closures in
		// this file that write captured variables do, in case the closure is
		// run more than once.
		isRollback = false
		i := 0
		for _, mutation := range desc.Mutations {
			if mutation.MutationID != sc.mutationID {
				// Mutations are applied in a FIFO order. Only apply the first set of
				// mutations if they have the mutation ID we're looking for.
				break
			}
			isRollback = mutation.Rollback
			if err := desc.MakeMutationComplete(mutation); err != nil {
				return err
			}
			i++
		}
		if i == 0 {
			// The table descriptor is unchanged. Don't let Publish() increment
			// the version.
			return errDidntUpdateDescriptor
		}
		// Trim the executed mutations from the descriptor.
		desc.Mutations = desc.Mutations[i:]

		for i, g := range desc.MutationJobs {
			if g.MutationID == sc.mutationID {
				// Trim the executed mutation group from the descriptor.
				desc.MutationJobs = append(desc.MutationJobs[:i], desc.MutationJobs[i+1:]...)
				break
			}
		}
		return nil
	}, func(txn *client.Txn) error {
		if err := sc.job.WithTxn(txn).Succeeded(ctx, jobs.NoopFn); err != nil {
			return errors.Wrapf(err, "failed to mark job %d as successful", *sc.job.ID())
		}

		schemaChangeEventType := EventLogFinishSchemaChange
		if isRollback {
			schemaChangeEventType = EventLogFinishSchemaRollback
		}

		// Log "Finish Schema Change" or "Finish Schema Change Rollback"
		// event. Only the table ID and mutation ID are logged; this can
		// be correlated with the DDL statement that initiated the change
		// using the mutation id.
		return MakeEventLogger(sc.execCfg).InsertEventRecord(
			ctx,
			txn,
			schemaChangeEventType,
			int32(sc.tableID),
			int32(sc.nodeID),
			struct {
				MutationID uint32
			}{uint32(sc.mutationID)},
		)
	})
}

// notFirstInLine reports whether the schema change has been queued up for
// execution behind another schema change, i.e. whether the first mutation
// with sc.mutationID is not at the head of the table's mutation list. It
// reports false when no mutation carries sc.mutationID.
func (sc *SchemaChanger) notFirstInLine(ctx context.Context) (bool, error) {
	var queuedBehind bool
	lookup := func(ctx context.Context, txn *client.Txn) error {
		queuedBehind = false
		tbl, err := sqlbase.GetTableDescFromID(ctx, txn, sc.tableID)
		if err != nil {
			return err
		}
		for pos := range tbl.Mutations {
			if tbl.Mutations[pos].MutationID != sc.mutationID {
				continue
			}
			queuedBehind = pos > 0
			break
		}
		return nil
	}
	err := sc.db.Txn(ctx, lookup)
	return queuedBehind, err
}

// runStateMachineAndBackfill runs the schema change state machine, then the
// backfill, and finally marks the mutations as completed. It stops at the
// first step that fails and returns that step's error.
func (sc *SchemaChanger) runStateMachineAndBackfill(
	ctx context.Context,
	lease *sqlbase.TableDescriptor_SchemaChangeLease,
	evalCtx *extendedEvalContext,
) error {
	if hook := sc.testingKnobs.RunBeforePublishWriteAndDelete; hook != nil {
		hook()
	}

	// Step 1: mutation state machine before the backfill.
	if err := sc.RunStateMachineBeforeBackfill(ctx); err != nil {
		return err
	}
	// Step 2: backfill(s).
	if err := sc.runBackfill(ctx, lease, evalCtx); err != nil {
		return err
	}
	// Step 3: mark the mutations as completed.
	if _, err := sc.done(ctx); err != nil {
		return err
	}
	return nil
}

// reverseMutations reverses the direction of all the mutations with the
// mutationID. This is called after hitting an irrecoverable error while
// applying a schema change. If a column being added is reversed and dropped,
// all new indexes referencing the column will also be dropped.
//
// In the same transaction that publishes the reversed descriptor, the
// schema change job (and the jobs of any other dropped mutations) is marked
// as failed with causingError, a rollback job is created, and a "reverse
// schema change" event is logged. On success sc.job is switched to the
// rollback job, which is then marked as started.
func (sc *SchemaChanger) reverseMutations(ctx context.Context, causingError error) error {
	// Reverse the flow of the state machine.
	var scJob *jobs.Job
	// All the mutations dropped by the reversal of the schema change.
	// This is created by traversing the mutations list like a graph
	// where the indexes refer to columns. Whenever a column schema change
	// is reversed, any index mutation referencing it is also reversed.
	var droppedMutations map[sqlbase.MutationID]struct{}
	_, err := sc.leaseMgr.Publish(ctx, sc.tableID, func(desc *sqlbase.TableDescriptor) error {
		// Keep track of the column mutations being reversed so that indexes
		// referencing them can be dropped.
		columns := make(map[string]struct{})
		// Reset the captured result so that nothing is carried over from a
		// previous invocation of this closure.
		droppedMutations = nil

		for i, mutation := range desc.Mutations {
			if mutation.MutationID != sc.mutationID {
				// Only reverse the first set of mutations if they have the
				// mutation ID we're looking for.
				if i == 0 {
					return errDidntUpdateDescriptor
				}
				break
			}

			if mutation.Rollback {
				// Can actually never happen. This prevents a rollback of
				// an already rolled back mutation.
				return errors.Errorf("mutation already rolled back: %v", mutation)
			}

			log.Warningf(ctx, "reverse schema change mutation: %+v", mutation)
			desc.Mutations[i], columns = reverseMutation(mutation, false /*notStarted*/, columns)

			desc.Mutations[i].Rollback = true
		}

		// Delete all mutations that reference any of the reversed columns
		// by running a graph traversal of the mutations.
		if len(columns) > 0 {
			var err error
			droppedMutations, err = sc.deleteIndexMutationsWithReversedColumns(ctx, desc, columns)
			if err != nil {
				return err
			}
		}

		// Publish() will increment the version.
		return nil
	}, func(txn *client.Txn) error {
		// Read the table descriptor from the store. The Version of the
		// descriptor has already been incremented in the transaction and
		// this descriptor can be modified without incrementing the version.
		tableDesc, err := sqlbase.GetTableDescFromID(ctx, txn, sc.tableID)
		if err != nil {
			return err
		}

		// Mark the schema change job as failed and create a rollback job.
		scJob, err = sc.createRollbackJob(ctx, txn, tableDesc, causingError)
		if err != nil {
			return err
		}

		// Mark other reversed mutation jobs as failed.
		for m := range droppedMutations {
			_, err := markJobFailed(ctx, txn, tableDesc, m, sc.jobRegistry, causingError)
			if err != nil {
				return err
			}
		}

		// Log "Reverse Schema Change" event. Only the causing error and the
		// mutation ID are logged; this can be correlated with the DDL statement
		// that initiated the change using the mutation id.
		return MakeEventLogger(sc.execCfg).InsertEventRecord(
			ctx,
			txn,
			EventLogReverseSchemaChange,
			int32(sc.tableID),
			int32(sc.nodeID),
			struct {
				Error      string
				MutationID uint32
			}{fmt.Sprintf("%+v", causingError), uint32(sc.mutationID)},
		)
	})
	if err != nil {
		return err
	}
	// Only update the job if the transaction has succeeded. From here on
	// sc.job references the rollback job.
	if scJob != nil {
		sc.job = scJob
		return scJob.Started(ctx)
	}
	return nil
}

// markJobFailed resolves the job attached to mutationID on tableDesc, loads it
// from jobRegistry within txn, and moves it to the failed state with
// causingError as the cause. The loaded job is returned together with the
// result of the failure transition.
func markJobFailed(
	ctx context.Context,
	txn *client.Txn,
	tableDesc *sqlbase.TableDescriptor,
	mutationID sqlbase.MutationID,
	jobRegistry *jobs.Registry,
	causingError error,
) (*jobs.Job, error) {
	// The job ID is looked up on the descriptor the caller already holds.
	id, err := getJobIDForMutationWithDescriptor(ctx, tableDesc, mutationID)
	if err != nil {
		return nil, err
	}
	failedJob, err := jobRegistry.LoadJobWithTxn(ctx, id, txn)
	if err != nil {
		return nil, err
	}
	// The failure is recorded through txn. The job is handed back even if
	// this last step reports an error.
	return failedJob, failedJob.WithTxn(txn).Failed(ctx, causingError, jobs.NoopFn)
}

// createRollbackJob marks the schema changer's current job as failed and
// creates a new job that represents rolling the schema change back. The
// MutationJobs entry for sc.mutationID is repointed at the new job and the
// table descriptor is written back through txn. The new rollback job is
// returned with its transaction cleared so it can be used outside txn.
func (sc *SchemaChanger) createRollbackJob(
	ctx context.Context, txn *client.Txn, tableDesc *sqlbase.TableDescriptor, causingError error,
) (*jobs.Job, error) {
	// The original job is failed first; its payload seeds the rollback job.
	failedJob, err := markJobFailed(ctx, txn, tableDesc, sc.mutationID, sc.jobRegistry, causingError)
	if err != nil {
		return nil, err
	}

	// Find the MutationJobs entry that belongs to this schema change.
	jobIdx := -1
	for i := range tableDesc.MutationJobs {
		if tableDesc.MutationJobs[i].MutationID == sc.mutationID {
			jobIdx = i
			break
		}
	}
	if jobIdx < 0 {
		// Cannot get here.
		return nil, fmt.Errorf("no job found for table %d mutation %d", sc.tableID, sc.mutationID)
	}

	// Each mutation of this schema change gets a resume span list that
	// covers the entire primary index, i.e. the whole table is rescanned.
	tableSpan := tableDesc.PrimaryIndexSpan()
	var resumeLists []jobspb.ResumeSpanList
	for i := range tableDesc.Mutations {
		if tableDesc.Mutations[i].MutationID != sc.mutationID {
			continue
		}
		resumeLists = append(resumeLists, jobspb.ResumeSpanList{
			ResumeSpans: []roachpb.Span{tableSpan},
		})
	}

	oldPayload := failedJob.Payload()
	record := jobs.Record{
		Description:   fmt.Sprintf("ROLL BACK JOB %d: %s", *failedJob.ID(), oldPayload.Description),
		Username:      oldPayload.Username,
		DescriptorIDs: oldPayload.DescriptorIDs,
		Details:       jobspb.SchemaChangeDetails{ResumeSpanList: resumeLists},
		Progress:      jobspb.SchemaChangeProgress{},
	}
	rollbackJob := sc.jobRegistry.NewJob(record)
	if err := rollbackJob.WithTxn(txn).Created(ctx); err != nil {
		return nil, err
	}
	// Detach the job from txn so that it can be used in other transactions.
	rollbackJob.WithTxn(nil)

	tableDesc.MutationJobs[jobIdx].JobID = *rollbackJob.ID()

	// Persist the descriptor; the version has already been incremented.
	batch := txn.NewBatch()
	batch.Put(sqlbase.MakeDescMetadataKey(tableDesc.GetID()), sqlbase.WrapDescriptor(tableDesc))
	if err := txn.Run(ctx, batch); err != nil {
		return nil, err
	}
	return rollbackJob, nil
}

// deleteIndexMutationsWithReversedColumns removes from desc every mutation
// that belongs to a different schema change than sc and that is tied, through
// an index, to one of the reversed columns. It proceeds in rounds, as a
// breadth first traversal: a round collects the mutation IDs of index
// mutations that reference a column in columns, then reverses and completes
// all mutations carrying those IDs. Reversing may add further columns to the
// set, so rounds repeat until one finds nothing new. The set of dropped
// mutation IDs is returned.
func (sc *SchemaChanger) deleteIndexMutationsWithReversedColumns(
	ctx context.Context, desc *sqlbase.TableDescriptor, columns map[string]struct{},
) (map[sqlbase.MutationID]struct{}, error) {
	dropped := make(map[sqlbase.MutationID]struct{})
	for {
		before := len(dropped)

		// Collect index mutations of other schema changes that touch a
		// reversed column.
		for _, m := range desc.Mutations {
			if m.MutationID == sc.mutationID {
				continue
			}
			idx := m.GetIndex()
			if idx == nil {
				continue
			}
			for _, colName := range idx.ColumnNames {
				if _, reversed := columns[colName]; !reversed {
					continue
				}
				// Such an index mutation must be an ADD that is still in the
				// DELETE_ONLY state: a live index over live columns cannot be
				// deleted, so the direction is never DROP, and every ADD
				// mutation begins in DELETE_ONLY.
				if !(m.Direction == sqlbase.DescriptorMutation_ADD &&
					m.State == sqlbase.DescriptorMutation_DELETE_ONLY) {
					panic(fmt.Sprintf("mutation in bad state: %+v", m))
				}
				log.Warningf(ctx, "drop schema change mutation: %+v", m)
				dropped[m.MutationID] = struct{}{}
				break
			}
		}

		if len(dropped) == before {
			// This round found nothing new; the traversal is complete.
			return dropped, nil
		}

		// Rebuild the mutation list without the dropped mutations.
		kept := make([]sqlbase.DescriptorMutation, 0, len(desc.Mutations))
		for _, m := range desc.Mutations {
			if _, drop := dropped[m.MutationID]; !drop {
				kept = append(kept, m)
				continue
			}
			// Reverse the mutation, which may add more purged columns to the
			// set. No rollback is needed since the mutation never started.
			m, columns = reverseMutation(m, true /*notStarted*/, columns)
			// Complete it right away: there is nothing to backfill.
			if err := desc.MakeMutationComplete(m); err != nil {
				return nil, err
			}
		}
		desc.Mutations = kept
	}
}

// reverseMutation flips the direction of mutation (ADD becomes DROP and DROP
// becomes ADD) and returns the updated mutation along with the columns set.
// When an ADD of a column is reversed, the column's name is recorded in
// columns. notStarted must be true only if the schema change state machine
// was never started for the mutation; in that case the mutation is required
// to still be in its initial state (DELETE_ONLY for an ADD,
// DELETE_AND_WRITE_ONLY for a DROP) and the function panics otherwise.
func reverseMutation(
	mutation sqlbase.DescriptorMutation, notStarted bool, columns map[string]struct{},
) (sqlbase.DescriptorMutation, map[string]struct{}) {
	if mutation.Direction == sqlbase.DescriptorMutation_ADD {
		mutation.Direction = sqlbase.DescriptorMutation_DROP
		// Remember the column whose addition is being undone.
		if addedCol := mutation.GetColumn(); addedCol != nil {
			columns[addedCol.Name] = struct{}{}
		}
		if notStarted && mutation.State != sqlbase.DescriptorMutation_DELETE_ONLY {
			panic(fmt.Sprintf("mutation in bad state: %+v", mutation))
		}
	} else if mutation.Direction == sqlbase.DescriptorMutation_DROP {
		mutation.Direction = sqlbase.DescriptorMutation_ADD
		if notStarted && mutation.State != sqlbase.DescriptorMutation_DELETE_AND_WRITE_ONLY {
			panic(fmt.Sprintf("mutation in bad state: %+v", mutation))
		}
	}
	return mutation, columns
}

// TestingSchemaChangerCollection is an exported (for testing) version of
// schemaChangerCollection. It wraps a pointer to the underlying collection,
// so operations performed through it act on the wrapped collection.
// TODO(andrei): get rid of this type once we can have tests internal to the sql
// package (as of April 2016 we can't because sql can't import server).
type TestingSchemaChangerCollection struct {
	scc *schemaChangerCollection
}

// ClearSchemaChangers empties the wrapped collection by truncating its list
// of schema changers to zero length. If this is called from a
// SyncSchemaChangersFilter, no schema changer will be run.
func (tscc TestingSchemaChangerCollection) ClearSchemaChangers() {
	coll := tscc.scc
	coll.schemaChangers = coll.schemaChangers[:0]
}

// SyncSchemaChangersFilter is the type of a hook to be installed through the
// ExecutorContext for blocking or otherwise manipulating schema changers run
// through the sync schema changers path. The hook receives the collection of
// schema changers wrapped in a TestingSchemaChangerCollection.
type SyncSchemaChangersFilter func(TestingSchemaChangerCollection)

// SchemaChangerTestingKnobs for testing the schema change execution path
// through both the synchronous and asynchronous paths.
type SchemaChangerTestingKnobs struct {
	// SyncFilter is called before running schema changers synchronously (at
	// the end of a txn). The function can be used to clear the schema
	// changers (if the test doesn't want them run using the synchronous path)
	// or to temporarily block execution. Note that this has nothing to do
	// with the async path for running schema changers. To block that, set
	// AsyncExecNotification.
	SyncFilter SyncSchemaChangersFilter

	// RunBeforePublishWriteAndDelete is called just before publishing the
	// write+delete state for the schema change.
	RunBeforePublishWriteAndDelete func()

	// RunBeforeBackfill is called just before starting the backfill.
	RunBeforeBackfill func() error

	// RunBeforeIndexBackfill is called just before starting the index
	// backfill, after fixing the index backfill scan timestamp.
	RunBeforeIndexBackfill func()

	// OldNamesDrainedNotification is called during a schema change,
	// after all leases on the version of the descriptor with the old
	// names are gone, and just before the mappings of the old names to the
	// descriptor id are deleted.
	OldNamesDrainedNotification func()

	// AsyncExecNotification is a function called before running a schema
	// change asynchronously. Returning an error will prevent the asynchronous
	// execution path from running.
	AsyncExecNotification func() error

	// AsyncExecQuickly executes queued schema changes as soon as possible.
	AsyncExecQuickly bool

	// WriteCheckpointInterval is the interval after which a checkpoint is
	// written.
	WriteCheckpointInterval time.Duration

	// BackfillChunkSize is to be used for all backfill chunked operations.
	BackfillChunkSize int64

	// TwoVersionLeaseViolation is called whenever a schema change
	// transaction is unable to commit because it is violating the two
	// version lease invariant.
	TwoVersionLeaseViolation func()

	// OnError is called with all the errors seen by the
	// synchronous code path.
	OnError func(err error)
}

// ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface. The
// method has an empty body and exists only so that the type provides it.
func (*SchemaChangerTestingKnobs) ModuleTestingKnobs() {}

// SchemaChangeManager processes pending schema changes seen in gossip
// updates. Most schema changes are executed synchronously by the node
// that created the schema change. If the node dies while
// processing the schema change this manager acts as a backup
// execution mechanism.
type SchemaChangeManager struct {
	ambientCtx   log.AmbientContext
	execCfg      *ExecutorConfig
	testingKnobs *SchemaChangerTestingKnobs
	// schemaChangers holds a schema changer for every outstanding schema
	// change seen, keyed by table ID.
	schemaChangers map[sqlbase.ID]SchemaChanger
	// forGC holds a schema changer for every dropped table that needs to be
	// GC-ed, keyed by table ID.
	forGC          map[sqlbase.ID]SchemaChanger
	distSQLPlanner *DistSQLPlanner
}

// NewSchemaChangeManager returns a new SchemaChangeManager that uses the
// given ambient context, executor configuration, testing knobs and DistSQL
// planner. Its maps of pending schema changers and of tables awaiting GC
// start out empty. The db and nodeDesc arguments are accepted but not used by
// this constructor.
func NewSchemaChangeManager(
	ambientCtx log.AmbientContext,
	execCfg *ExecutorConfig,
	testingKnobs *SchemaChangerTestingKnobs,
	db client.DB,
	nodeDesc roachpb.NodeDescriptor,
	dsp *DistSQLPlanner,
) *SchemaChangeManager {
	mgr := new(SchemaChangeManager)
	mgr.ambientCtx = ambientCtx
	mgr.execCfg = execCfg
	mgr.testingKnobs = testingKnobs
	mgr.distSQLPlanner = dsp
	// Both maps must be non-nil because Start writes into them.
	mgr.schemaChangers = make(map[sqlbase.ID]SchemaChanger)
	mgr.forGC = make(map[sqlbase.ID]SchemaChanger)
	return mgr
}

// newTimer creates the timer used by the manager to decide when to run the
// next schema changer from changers. The timer is set to the smallest
// remaining wait (execAfter minus the current time) over all changers. When
// changers is empty a zero-value timer is returned instead; its channel is
// nil, so a receive on it never completes.
func (s *SchemaChangeManager) newTimer(changers map[sqlbase.ID]SchemaChanger) *time.Timer {
	if len(changers) == 0 {
		return new(time.Timer)
	}
	shortest := time.Duration(math.MaxInt64)
	now := timeutil.Now()
	for id := range changers {
		if wait := changers[id].execAfter.Sub(now); wait < shortest {
			shortest = wait
		}
	}
	return time.NewTimer(shortest)
}

// Start starts a goroutine that runs outstanding schema changes
// for tables received in the latest system configuration via gossip.
//
// The worker loops over four events until the stopper signals shutdown:
//
//   - a system config gossip update: table descriptors that changed are
//     inspected and queued either as pending schema changes or as dropped
//     tables awaiting GC, and both timers are re-armed when anything changed;
//   - the schema change timer: at most one due schema changer is run;
//   - the GC timer: at most one due GC schema changer is run;
//   - stopper.ShouldStop(): the worker returns.
func (s *SchemaChangeManager) Start(stopper *stop.Stopper) {
	stopper.RunWorker(s.ambientCtx.AnnotateCtx(context.Background()), func(ctx context.Context) {
		// cfgFilter reports modified keys under the descriptor table prefix.
		descKeyPrefix := keys.MakeTablePrefix(uint32(sqlbase.DescriptorTable.ID))
		cfgFilter := gossip.MakeSystemConfigDeltaFilter(descKeyPrefix)
		// zoneCfgFilter reports modified keys under the primary index of
		// the zones table.
		k := keys.MakeTablePrefix(uint32(keys.ZonesTableID))
		k = encoding.EncodeUvarintAscending(k, uint64(keys.ZonesTablePrimaryIndexID))
		zoneCfgFilter := gossip.MakeSystemConfigDeltaFilter(k)
		gossipUpdateC := s.execCfg.Gossip.RegisterSystemConfigChannel()
		// Zero-value timers have a nil channel, so their select cases stay
		// blocked until newTimer replaces them.
		timer := &time.Timer{}
		gcTimer := &time.Timer{}
		// A jitter is added to reduce contention between nodes
		// attempting to run the schema change.
		delay := time.Duration(float64(asyncSchemaChangeDelay) * (0.9 + 0.2*rand.Float64()))
		if s.testingKnobs.AsyncExecQuickly {
			delay = 20 * time.Millisecond
		}

		// execOneSchemaChange runs the first schema changer found in the
		// map whose execAfter time has passed. A changer that succeeds, or
		// whose table descriptor no longer exists, is removed from the map;
		// any other failure leaves it queued with a later execAfter.
		execOneSchemaChange := func(schemaChangers map[sqlbase.ID]SchemaChanger) {
			for tableID, sc := range schemaChangers {
				if timeutil.Since(sc.execAfter) > 0 {
					evalCtx := createSchemaChangeEvalCtx(s.execCfg.Clock.Now(), &SessionTracing{})

					execCtx, cleanup := tracing.EnsureContext(ctx, s.ambientCtx.Tracer, "schema change [async]")
					err := sc.exec(execCtx, false /* inSession */, &evalCtx)
					cleanup()

					// Advance the execAfter time so that this schema
					// changer doesn't get called again for a while.
					sc.execAfter = timeutil.Now().Add(delay)
					schemaChangers[tableID] = sc

					if err != nil {
						if shouldLogSchemaChangeError(err) {
							log.Warningf(ctx, "Error executing schema change: %s", err)
						}
						if err == sqlbase.ErrDescriptorNotFound {
							// Someone deleted this table. Don't try to run the schema
							// changer again. Note that there's no gossip update for the
							// deletion which would remove this schemaChanger.
							delete(schemaChangers, tableID)
						}
					} else {
						// We successfully executed the schema change. Delete it.
						delete(schemaChangers, tableID)
					}

					// Only attempt to run one schema changer.
					break
				}
			}
		}

		for {
			select {
			case <-gossipUpdateC:
				// NOTE(review): the second return value of GetSystemConfig is
				// ignored here; presumably a notification on the channel
				// implies a config is available — TODO confirm.
				cfg, _ := s.execCfg.Gossip.GetSystemConfig()
				// Read all tables and their versions
				if log.V(2) {
					log.Info(ctx, "received a new config")
				}

				resetTimer := false
				// Check to see if the zone cfg has been modified.
				zoneCfgModified := false
				zoneCfgFilter.ForModified(cfg, func(kv roachpb.KeyValue) {
					zoneCfgModified = true
				})
				if zoneCfgModified {
					// Check to see if the GC TTL has changed for all the
					// tables that are currently waiting to be GC-ed. If the
					// GC TTL for a table has indeed changed it is modified
					// and enqueued with the new TTL timeout.
					for id, sc := range s.forGC {
						if sc.dropTime > 0 {
							zoneCfg, _, err := ZoneConfigHook(cfg, uint32(id), nil)
							if err != nil {
								log.Errorf(ctx, "no zone config for desc: %d", id)
								// Skip only this table. A return here would
								// leave the worker function and stop all
								// further asynchronous schema change and GC
								// processing on this node.
								continue
							}
							deadline := sc.dropTime +
								int64(zoneCfg.GC.TTLSeconds)*time.Second.Nanoseconds() +
								int64(delay)
							if ea := timeutil.Unix(0, deadline); ea != sc.execAfter {
								resetTimer = true
								sc.execAfter = ea
								// Safe to modify map inplace while iterating over it.
								s.forGC[id] = sc
								if log.V(2) {
									log.Infof(ctx,
										"re-queue up pending drop table GC; table: %d", id)
								}
							}
						}
					}
				}

				// Template schema changer; the per-table fields are filled in
				// below before a copy is stored in one of the maps.
				schemaChanger := SchemaChanger{
					execCfg:              s.execCfg,
					nodeID:               s.execCfg.NodeID.Get(),
					db:                   s.execCfg.DB,
					leaseMgr:             s.execCfg.LeaseManager,
					testingKnobs:         s.testingKnobs,
					distSQLPlanner:       s.distSQLPlanner,
					jobRegistry:          s.execCfg.JobRegistry,
					leaseHolderCache:     s.execCfg.LeaseHolderCache,
					rangeDescriptorCache: s.execCfg.RangeDescriptorCache,
					clock:                s.execCfg.Clock,
					settings:             s.execCfg.Settings,
				}

				execAfter := timeutil.Now().Add(delay)
				cfgFilter.ForModified(cfg, func(kv roachpb.KeyValue) {
					resetTimer = true
					// Attempt to unmarshal config into a table/database descriptor.
					var descriptor sqlbase.Descriptor
					if err := kv.Value.GetProto(&descriptor); err != nil {
						log.Warningf(ctx, "%s: unable to unmarshal descriptor %v", kv.Key, kv.Value)
						return
					}
					switch union := descriptor.Union.(type) {
					case *sqlbase.Descriptor_Table:
						table := union.Table
						table.MaybeFillInDescriptor()
						if err := table.ValidateTable(s.execCfg.Settings); err != nil {
							log.Errorf(ctx, "%s: received invalid table descriptor: %s. Desc: %v",
								kv.Key, err, table,
							)
							return
						}

						schemaChanger.tableID = table.ID
						schemaChanger.mutationID = sqlbase.InvalidMutationID
						schemaChanger.execAfter = execAfter
						schemaChanger.dropTime = 0

						// Keep track of outstanding schema changes.
						// If all schema change commands always set UpVersion, why
						// check for the presence of mutations?
						// A schema change execution might fail soon after
						// unsetting UpVersion, and we still want to process
						// outstanding mutations.
						if table.UpVersion || table.Adding() ||
							table.HasDrainingNames() || len(table.Mutations) > 0 {
							if log.V(2) {
								log.Infof(ctx, "%s: queue up pending schema change; table: %d, version: %d",
									kv.Key, table.ID, table.Version)
							}

							if len(table.Mutations) > 0 {
								schemaChanger.mutationID = table.Mutations[0].MutationID
							}
							s.schemaChangers[table.ID] = schemaChanger
						} else if table.Dropped() {
							// If the table is dropped add table to map forGC.
							if log.V(2) {
								log.Infof(ctx,
									"%s: queue up pending drop table GC; table: %d, version: %d",
									kv.Key, table.ID, table.Version)
							}

							if table.DropTime > 0 {
								schemaChanger.dropTime = table.DropTime
								zoneCfg, _, err := ZoneConfigHook(cfg, uint32(table.ID), nil)
								if err != nil {
									log.Errorf(ctx, "no zone config for desc: %d", table.ID)
									// This return only leaves the ForModified
									// callback for the current key.
									return
								}
								deadline := table.DropTime +
									int64(zoneCfg.GC.TTLSeconds)*time.Second.Nanoseconds() +
									int64(delay)
								schemaChanger.execAfter = timeutil.Unix(0, deadline)
							}

							s.forGC[table.ID] = schemaChanger
							// Remove from schema change map if present because
							// this table has been dropped and is only waiting
							// to be GC-ed.
							delete(s.schemaChangers, table.ID)
						}

					case *sqlbase.Descriptor_Database:
						// Ignore.
					}
				})

				if resetTimer {
					timer = s.newTimer(s.schemaChangers)
					gcTimer = s.newTimer(s.forGC)
				}

			case <-timer.C:
				// The testing knob can veto this round; the timer is re-armed
				// either way.
				if s.testingKnobs.AsyncExecNotification != nil &&
					s.testingKnobs.AsyncExecNotification() != nil {
					timer = s.newTimer(s.schemaChangers)
					continue
				}

				execOneSchemaChange(s.schemaChangers)

				timer = s.newTimer(s.schemaChangers)

			case <-gcTimer.C:
				// Same veto handling as for the schema change timer.
				if s.testingKnobs.AsyncExecNotification != nil &&
					s.testingKnobs.AsyncExecNotification() != nil {
					gcTimer = s.newTimer(s.forGC)
					continue
				}

				execOneSchemaChange(s.forGC)

				gcTimer = s.newTimer(s.forGC)

			case <-stopper.ShouldStop():
				return
			}
		}
	})
}

// createSchemaChangeEvalCtx builds the extendedEvalContext used for
// backfills. The context carries the given tracing handle and a fresh session
// with the default search path, no current database, new sequence state and
// UTC as its data conversion location. Its transaction and statement
// timestamps are both set from the wall time of ts.
//
// TODO(andrei): This EvalContext() will be broken for backfills trying to use
// functions marked with distsqlBlacklist.
func createSchemaChangeEvalCtx(ts hlc.Timestamp, tracing *SessionTracing) extendedEvalContext {
	// The database is left empty: schema changes are not supposed to need
	// it, since backfills shouldn't contain unqualified identifiers and the
	// pure functions that need it should already have been evaluated.
	//
	// TODO(andrei): find a way to assert that this field is indeed not used.
	// And in fact it is used by `current_schemas()`, which, although is a pure
	// function, takes arguments which might be impure (so it can't always be
	// pre-evaluated).
	sessionData := &sessiondata.SessionData{
		SearchPath:    sqlbase.DefaultSearchPath,
		Database:      "",
		SequenceState: sessiondata.NewSequenceState(),
		DataConversion: sessiondata.DataConversionConfig{
			Location: time.UTC,
		},
	}
	evalCtx := extendedEvalContext{
		Tracing:     tracing,
		EvalContext: tree.EvalContext{SessionData: sessionData},
	}

	// Functions such as now() will see the timestamp passed in here. The
	// backfill may already have been partially performed by another
	// SchemaChangeManager using a different timestamp.
	//
	// TODO(andrei): Figure out if this is what we want, and whether the
	// timestamp from the session that enqueued the schema change
	// is/should be used for impure functions like now().
	backfillTime := timeutil.Unix(0 /* sec */, ts.WallTime)
	evalCtx.SetTxnTimestamp(backfillTime)
	evalCtx.SetStmtTimestamp(backfillTime)

	return evalCtx
}
