/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/spa.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <sys/disklabel.h>
#include <sys/dkio.h>

/*
 * Virtual device vector for disks.
 */

/*
 * Open the block device backing a disk vdev.
 *
 *	vd	vdev to open; vd->vdev_path must be an absolute path.
 *	psize	out: size of the opened partition, in bytes.
 *	ashift	out: log2 of the larger of the device sector size and
 *		SPA_MINBLOCKSIZE.
 *
 * Returns 0 on success, with the opened, unlocked vnode stored in the
 * vdev_disk_t attached to vd->vdev_tsd.  On failure an errno is returned
 * and vd->vdev_stat.vs_aux records the reason.  Once allocated, the
 * vdev_disk_t stays attached to vd->vdev_tsd even on failure;
 * vdev_disk_close() releases it.
 */
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
	struct partinfo pinfo;
	vdev_disk_t *dvd;
	vnode_t *vp;
	int error, cmd;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent.  We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device.  But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the intended sequence of opening devices is:
	 *
	 * 1. Try opening the device by path.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved.  Try opening the device
	 *    by the devid instead.
	 *
	 * XXXNETBSD only step 1 is implemented; devid handling (wedges) is
	 * still missing, so vd->vdev_devid is not consulted here.
	 */
	error = vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0,
	    &vp, CRCREAT, 0);
	if (error != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}

	/* Only block devices can back a disk vdev. */
	if (vp->v_type != VBLK) {
		error = EINVAL;
		goto fail;
	}

	/*
	 * XXXNETBSD Compare the devid to the stored value.
	 */

	/*
	 * Determine the actual size of the device.
	 * XXXNETBSD wedges.
	 *
	 * DIOCGPART answers by pointing pinfo.disklab and pinfo.part at the
	 * driver's in-core label and partition entry, so the results must be
	 * read through pinfo rather than from caller-supplied storage.
	 */
	pinfo.disklab = NULL;
	pinfo.part = NULL;
	error = VOP_IOCTL(vp, DIOCGPART, &pinfo, FREAD|FWRITE,
	    kauth_cred_get());
	if (error != 0)
		goto fail;

	*psize = (uint64_t)pinfo.part->p_size * pinfo.disklab->d_secsize;
	*ashift = highbit(MAX(pinfo.disklab->d_secsize, SPA_MINBLOCKSIZE)) - 1;
	vd->vdev_wholedisk = (pinfo.part->p_offset == 0);

	/*
	 * Try to enable the disk's read and write caches.  We ignore errors
	 * because it's OK if we can't do it.
	 *
	 * NOTE(review): this is attempted whether or not vdev_wholedisk was
	 * set above -- confirm that is intended for partitions that share
	 * the disk with other consumers.
	 */
	cmd = DKCACHE_READ | DKCACHE_WRITE;
	(void) VOP_IOCTL(vp, DIOCSCACHE, &cmd, FREAD|FWRITE,
	    kauth_cred_get());

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/* Keep the reference for later I/O, but drop the vnode lock. */
	VOP_UNLOCK(vp, 0);
	dvd->vd_lh = (ldi_handle_t)vp;

	return (0);

fail:
	/*
	 * The vnode was opened and is still locked at this point: close it
	 * (mirroring vdev_disk_close()) so the device's open count is not
	 * leaked, then unlock and release it.
	 */
	VOP_CLOSE(vp, FREAD|FWRITE, 1, 0, kauth_cred_get());
	vput(vp);
	vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
	return (error);
}

/*
 * Tear down the disk state attached to a vdev: close and release the
 * backing vnode if one was recorded, then free the vdev_disk_t and clear
 * vd->vdev_tsd.  A vdev with no attached state is left untouched.
 */
static void
vdev_disk_close(vdev_t *vd)
{
	vdev_disk_t *disk = vd->vdev_tsd;
	vnode_t *devvp;

	if (disk == NULL)
		return;

	dprintf("removing disk %s, devid %s\n",
	    vd->vdev_path ? vd->vdev_path : "<none>",
	    vd->vdev_devid ? vd->vdev_devid : "<none>");

	devvp = (vnode_t *)disk->vd_lh;
	if (devvp != NULL) {
		/* Take the vnode lock for the close; vput() drops it again. */
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(devvp, FREAD|FWRITE, 1, 0, kauth_cred_get());
		vput(devvp);
	}

	kmem_free(disk, sizeof (vdev_disk_t));
	vd->vdev_tsd = NULL;
}

/*
 * Completion callback for a buf issued on behalf of a zio.
 *
 * The owning zio is taken from bp->b_saveaddr.  The buf's error becomes
 * the zio's error; a transfer that reports no error but still has residual
 * bytes is turned into EIO.  The buf is then returned with putiobuf() and
 * the zio is handed to zio_next_stage_async().
 */
static void
vdev_disk_io_intr(buf_t *bp)
{
	zio_t *owner = bp->b_saveaddr;	/* XXX */

	owner->io_error = bp->b_error;
	if (owner->io_error == 0 && bp->b_resid != 0) {
		/* Short transfer without an explicit error. */
		owner->io_error = EIO;
	}

	putiobuf(bp);

	zio_next_stage_async(owner);
}

/*
 * Start a zio against a disk vdev.
 *
 * ZIO_TYPE_IOCTL requests are handled synchronously here: the only command
 * supported is DKIOCFLUSHWRITECACHE, which is issued to the device as
 * DIOCCACHESYNC.  The result is stored in zio->io_error and the zio is
 * advanced exactly once with zio_next_stage_async().
 *
 * Reads and writes first go through vdev_cache_read() (reads only) and
 * vdev_queue_io(); if either takes ownership of the zio this function
 * returns.  Otherwise a buf is set up for the transfer and handed to
 * VOP_STRATEGY(); vdev_disk_io_intr() runs on completion and picks the zio
 * back up from bp->b_saveaddr.
 */
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_disk_t *dvd = vd->vdev_tsd;
	vnode_t *vp = (vnode_t *)dvd->vd_lh;
	buf_t *bp;
	int flags, error, cmd;

	if (zio->io_type == ZIO_TYPE_IOCTL) {
		zio_vdev_io_bypass(zio);

		/* XXPOLICY */
		if (vdev_is_dead(vd)) {
			zio->io_error = ENXIO;
			zio_next_stage_async(zio);
			return;
		}

		switch (zio->io_cmd) {

		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (vd->vdev_nowritecache) {
				zio->io_error = ENOTSUP;
				break;
			}

			/*
			 * XXXNETBSD punt to a kthread.  It can be async.
			 */
			vn_lock(vp, LK_EXCLUSIVE|LK_RETRY);
			cmd = 1;
			error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd,
			    FREAD|FWRITE, kauth_cred_get());
			VOP_UNLOCK(vp, 0);

			if (error == ENOTTY || error == EINVAL) {
				/*
				 * With these errors we know that no future
				 * attempts will ever succeed.  In this case we
				 * set a persistent bit so that we don't bother
				 * with the ioctl in the future.
				 */
				vd->vdev_nowritecache = B_TRUE;
			}
			zio->io_error = error;

			/*
			 * The ioctl completed synchronously, so fall out of
			 * the switch and let the single call below advance
			 * the zio; advancing it here as well would run the
			 * next stage twice.
			 */
			break;

		default:
			zio->io_error = ENOTSUP;
			break;
		}

		zio_next_stage_async(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
		return;

	if ((zio = vdev_queue_io(zio)) == NULL)
		return;

	flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
	flags |= B_BUSY | B_NOCACHE;

	/*
	 * NOTE(review): depending on the kernel's buf(9) contract, further
	 * setup may be needed for b_iodone to be invoked and for write
	 * accounting (e.g. B_CALL, v_numoutput) -- confirm.
	 */
	bp = getiobuf();
	bp->b_flags |= flags;
	bp->b_bcount = zio->io_size;
	bp->b_data = zio->io_data;
	/* io_offset is in bytes; b_blkno counts DEV_BSIZE blocks. */
	bp->b_blkno = btodb(zio->io_offset);
	bp->b_iodone = vdev_disk_io_intr;
	/* vdev_disk_io_intr() recovers the zio from b_saveaddr. */
	bp->b_saveaddr = (void *)zio;
	bp->b_vp = vp;

	/* XXPOLICY */
	error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
	if (error) {
		/*
		 * Fail the I/O without touching the device by completing the
		 * buf ourselves with the error and a full residual.
		 */
		zio->io_error = error;
		bp->b_error = error;
		bp->b_resid = bp->b_bcount;
		bp->b_iodone(bp);
		return;
	}

	error = VOP_STRATEGY(vp, bp);
	/* will return non-zero only on programming errors */
	ASSERT(error == 0);
}

/*
 * Post-I/O stage for a disk vdev zio.
 *
 * Notifies the vdev queue that the zio has finished, passes completed
 * writes to vdev_cache_write(), optionally substitutes an injected device
 * error for an otherwise successful zio, and then moves the zio on with
 * zio_next_stage().
 */
static void
vdev_disk_io_done(zio_t *zio)
{
	vdev_queue_io_done(zio);

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_cache_write(zio);
	}

	/* Fault injection only ever replaces a success, never a real error. */
	if (zio_injection_enabled) {
		if (zio->io_error == 0) {
			zio->io_error =
			    zio_handle_device_injection(zio->io_vd, EIO);
		}
	}

	zio_next_stage(zio);
}

/*
 * Operations vector for disk vdevs.  The initializer is positional, so the
 * entries must stay in the order in which vdev_ops_t declares its members.
 */
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,		/* open the backing block device */
	vdev_disk_close,	/* close it and free per-vdev state */
	vdev_default_asize,	/* not defined in this file */
	vdev_disk_io_start,	/* issue an ioctl/read/write zio */
	vdev_disk_io_done,	/* post-I/O processing */
	NULL,			/* unset slot; presumably state change -- confirm against vdev_ops_t */
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
