#!/usr/bin/perl -w

# usage:  format_mount.pl [diskID] swapsize_in_MB filesystem type [DEBUG|NODEBUG]
#  ex:    format_mount.pl sda 8000 xfs NODEBUG
#         above will
#         - silently create an 8GB swap partition on /dev/sda,
#         - add the rest as a /scratch partition formatted as XFS
#         - add the swap with swapon, mount the scratch partition
#         - write a verification file to /scratch so it can be ID'ed on reboot.

#  NOTE the diskID is the 'sda' part, not the full '/dev/sda'
#  NOTE if you reset the command for a different partitioning, but leave
# the validation file intact, this util will fail to re-partition.  It will
# find the validation file and assume everything's OK.

# for Perceus: need to set the VNFS /etc/fstab to include entries for
#     the expected swap and /scratch partitions that will be created on the
#     /dev/sdx device (usually /dev/sda, for a single disk per compute node)
#  format_mount.pl will use 'parted' to determine disk size and make a swap partition
# of the given size. The rest of the disk will be made into scratch and mkfs'ed with
# the filesystem type you selected at execution time.

# Once the variables are set (see === Options BEGIN === below), this does some
# primitive checking on the defined disk (it can be mounted or unmounted).
# - checks whether such a device is logged in dmesg,
# - checks whether it has already been formatted correctly
#     (checks for the existence of the validation file)
# - if it doesn't find the validation file it assumes that the disk needs formatting
# - then it partitions the disk in 2 partitions, a swap partition and a scratch filesystem
# - then it formats the scratch filesystem with ext3/ext4/xfs (xfs -> almost no time for
#     any sized format)
# - once it partitions and formats, it writes the validation file (so that
#     this doesn't have to be repeated), then swapon's the new swap and mounts the
#     new /scratch partition (see above Perceus note)

use strict;
use vars qw(
$disk $rawdev $swap_MB $disksize_MB $mount_part $device $swap_part $mountpoint $valid_file $ftype
$dfgrep $mounttest $part_line $N @L $mk_frst_part_cmd $mk_scnd_part_cmd $part_nbr
$i $tmp $stillmounted $dbg $dmesg_hint $disk_line $parted $parted_ver $n @l  $forceflag
$mkfs $who $cur_fstype @fstypes
);

$| = 1;    # autoflush the currently selected output handle (STDOUT)
$dbg = 1;  # default verbosity (progress dots); DEBUG on the commandline raises it

# =================== Options BEGIN ====================
$mountpoint = "/scratch";                        # where the scratch partition is mounted
$valid_file = "$mountpoint/perceus_validation";  # marker file: disk already prepared
$swap_part  = "1";                               # partition number used for swap
$mount_part = "2";                               # partition number mounted on $mountpoint
@fstypes    = qw(ext3 ext4 xfs);                 # filesystem types this script knows about
# =================== Options END ====================

chomp($who = `whoami`);  # name of the user running the script
die "ERROR: You're not ROOT!! (This is not going to end well...)" unless $who eq 'root';
# ---- Command line handling, sanity checks and disk size detection ----
# Requires exactly four arguments (diskID, swap size in MB, filesystem type,
# DEBUG|NODEBUG).  On success this sets $disk, $swap_MB, $ftype, $forceflag,
# $mkfs, $rawdev, $device and $disksize_MB for the rest of the script, and
# dies with a message on any problem found along the way.
if (@ARGV != 4) {
	die <<"USAGE";

FATAL: 'format_mount.pl' needs 4 args:
        - the device name (ie 'sdc')
        - the swap partition size in MB (rest of the disk will be scratch)
        - the filesystem type to be made (ext3, ext4, or xfs)
        - DEBUG [spews debug info] or NODEBUG [is quieter]

Example: sudo ./format_mount.pl sdb 300 xfs NODEBUG

USAGE

} else {
	$disk = $ARGV[0];
	$swap_MB = $ARGV[1];
	$ftype = $ARGV[2];
	if ($ARGV[3] eq "DEBUG") {$dbg = 2;}

	# Both values end up inside shell command lines (and $disk inside a
	# regex), so refuse anything that is not a plain name / whole number.
	if ($disk !~ m{\A\w[\w/-]*\z}) {
		die "ERROR: diskID [$disk] contains unexpected characters (expected something like 'sda').\n";
	}
	if ($swap_MB !~ /\A\d+\z/) {
		die "ERROR: swap size [$swap_MB] must be a whole number of MB.\n";
	}

	# Exact comparison: a substring match would accept e.g. 'ext3foo' and
	# then try to run 'mkfs.ext3foo'.
	if ($ftype eq "ext3" || $ftype eq "ext4") {$forceflag = "-F";}
	elsif ($ftype eq "xfs")  {$forceflag = "-f";}
	else {die "Filesystem type must be one of 'ext3', 'ext4', or 'xfs'\n";}
	$mkfs = "mkfs." . $ftype;

	if ($disk !~ /sd/) {
		info($dbg,"WARN: diskID is not 'sdx' format.  You have 5s to stop this process.");
		for my $tick (1 .. 5) {print "$tick..";	sleep 1;}
	}

	$rawdev = "/dev/" . $disk; # like /dev/sdb
	print STDERR "INFO: 'format_mount.pl': beginning to repartition scratch disk: [$rawdev]\n";

	# make sure the mountpoint exists and is a directory.
	if (-e $mountpoint) {
		if (! -d $mountpoint) {
			info($dbg,"WARN: $mountpoint is not a dir; deleting and re-mk'ing it.");
			unlink $mountpoint or die "ERROR: can't remove [$mountpoint]: $!\n";
			mkdir $mountpoint  or die "ERROR: can't mkdir [$mountpoint]: $!\n";
		}
	} else {
		info($dbg,"WARN: No $mountpoint; mk'ing it.");
		mkdir $mountpoint or die "ERROR: can't mkdir [$mountpoint]: $!\n";
	}

# is there any such rawdev on the system?  grep thru dmesg to check for one.
# this will be a fresh boot, so dmesg should be a decent log of whether it's there.
# in debugging mode, where dmesg is long and there may have been thumbdrives plugged,
# it's not such a good test. But if it returns nothing, there's really nothing.
	$dmesg_hint = `dmesg | grep $disk`;
	if ($dmesg_hint !~ /\Q$disk\E/) {
		die "\nHARDWARE ERROR: No such device [$rawdev] in dmesg; exiting!\n\n";
	}

	# test for parted before continuing!!  Accept it wherever 'which' finds
	# an executable (not only under /sbin).
	$parted = `which parted`; chomp $parted;
	if ($parted eq '' || ! -x $parted) {
		die "ERROR:  no 'parted', so can't partition the disk!\n";
	}
	# figure out what version it is: last word of the "GNU Parted" line.
	$parted_ver = `parted --version | grep -i "gnu parted" `; chomp $parted_ver;
	$n = @l = split /\s+/, $parted_ver;
	$parted_ver = $n ? $l[$n-1] : 'unknown';
	info($dbg,"INFO: [$parted] is version [$parted_ver].");

	$device = $rawdev . $mount_part;

	# The "Disk /dev/sdX: <size><unit>" line carries the total size in its
	# third whitespace-separated field.
	$disk_line = `parted --script $rawdev print |grep "^Disk"`;
	chomp $disk_line;
	$N = @L = split /\s+/, $disk_line;
	if (!defined $L[2] || $L[2] !~ /\A(\d+(?:[.,]\d+)?)(kB|MB|GB|TB)\z/i) {
		die "ERROR: can't determine the size of [$rawdev] from parted output [$disk_line]\n";
	}
	my ($size_num, $size_unit) = ($1, uc $2);
	$size_num =~ tr/,/./; # tolerate a decimal comma
	# Decimal (1000-based) units; the fractional part of the reported number
	# is dropped, as this script has always done for GB and MB.
	# NOTE(review): for TB-sized disks this leaves the fractional TB unused;
	# asking parted for 'unit MB' output would be more precise - confirm it
	# is available on the parted versions in use before switching.
	if ($size_unit eq 'KB') {
		$disksize_MB = int($size_num / 1000);
	} else {
		my %mb_per_unit = (MB => 1, GB => 1000, TB => 1000000);
		$disksize_MB = int($size_num) * $mb_per_unit{$size_unit};
	}
	if ($size_unit ne 'MB') {
		info($dbg,"INFO: Disk size [$L[2]] conv to MB: [$disksize_MB]");
	}
	if ($swap_MB >= $disksize_MB) {
		die "ERROR: swap size [$swap_MB MB] leaves no room for scratch on [$rawdev] ($disksize_MB MB).\n";
	}
}
# check to see if the device is already mounted (for debugging script)
# if it is, umount it.  Note that an automounted disk might mount all partitions
# so you really have to check for and umount all partitions, not just the one you're interested in.
# so cycle thru a 'df' listing until there are no $rawdev entries left.
# The loop is bounded: if a partition refuses to unmount (e.g. it is busy) we
# stop instead of spinning forever, because repartitioning a mounted disk is
# not safe.
info($dbg,"INFO: Checking if any [$rawdev] partitions are mounted");
$dfgrep = `df -h |grep $rawdev`;
if ($dfgrep =~  /\Q$rawdev\E/) { # then the partition is mounted from a previous run
	my $max_attempts = 20;
	my $attempt = 0;
	$stillmounted = 1;
	while ($stillmounted) {
		if (++$attempt > $max_attempts) {
			die "ERROR: [$rawdev] still has mounted partitions after $max_attempts umount attempts; refusing to repartition.\n";
		}
		# first field of the df output is the first mounted device still listed
		$N = @L = split(/\s+/,$dfgrep);
		info($dbg,"INFO: unmounting $L[0]");
		system "umount $L[0]";
		sleep 1;
		info($dbg,"INFO: [$L[0]] should have unmounted.");
		$dfgrep = `df -h |grep $rawdev`;
		if ($dfgrep !~  /\Q$rawdev\E/){$stillmounted = 0;}
	}
} else {
	info($dbg,"INFO: Good: Looks like [$device] isn't mounted.  Continuing...");
}

# if it's a blank disk or it doesn't have the expected partition, the next part will fail
# so have to check to see if it does have the expected partition.  If not, skip the validation file.
# $part_line is the last line of parted's partition table listing.
$part_line = `parted --script $rawdev print | grep -A9 '^Number'  |tail -2 |head -1`;

$N = @L = split (/\s+/, $part_line);
# for some reason, parted can output the filesystem type in position [5] or [6],
# so have to check both for ext3, ext4, xfs.  Fields that are missing (short
# or empty line) are skipped rather than compared; the first field that names
# a known filesystem wins.
undef $cur_fstype;
FIELD:
for my $idx (5, 6) {
	next FIELD unless defined $L[$idx];
	for my $known (@fstypes) {
		if ($L[$idx] eq $known) {$cur_fstype = $known; last FIELD;}
	}
}

if (defined $cur_fstype) { # the last partition carries a known filesystem, so the disk may already be set up
	info($dbg,"INFO: Current filetype of disk is [$cur_fstype]");


	# $mount_part is the highest partition number this script creates; if the
	# last partition has that number, the disk might be set up already.
	if (defined $L[1] && $L[1] eq $mount_part){
		info($dbg,"INFO: Now trying to mount unmounted [$device]...");
		$mounttest = system("mount -t $cur_fstype $device $mountpoint");
		info($dbg,"INFO: sleeping for 1 sec to allow mount to settle...");
		sleep 1;

		if ($mounttest != 0){ # parted reads ext4 as ext3, so also try ext4
			if ($cur_fstype =~ /ext3/) {
				info($dbg,"INFO: parted reports ext3, but doesn't mount; trying ext4.");
				$mounttest = system("mount -t ext4 $device $mountpoint");
				sleep 1;
			}
		}
		if ($mounttest == 0) {
			info($dbg,"INFO: Device [$device] mounted!");
			if (-f "$valid_file") {
				info($dbg,"INFO: Validation file [$valid_file] exists!");
				#system "umount $device";
				info($dbg,"\nANNOUNCE: That's it folks!  The disk is fine!");
				goto FINE; # skip partitioning/formatting; only swapon remains
			} else {
				info($dbg,"INFO: Validation file [$valid_file] not present");
				# can mount a disk with the right partition and filetype but NOT the validation file, so it still needs to be formatted
				info($dbg,"INFO: Need to unmount [$device].  Trying..");
				# the disk is about to be repartitioned, so it must not stay mounted
				if (system("umount $device") != 0) {
					die "ERROR: Can't unmount [$device]; refusing to repartition a mounted disk.\n";
				}
			}
		} else {
			# so the device is there but it either doesn't have the right fstype
			# or the right partition; stop rather than risk destroying data.
			die "ERROR: Can't mount [$device] on [$mountpoint]. Possible corruption danger.\nCheck disk manually.\n"
		}
	} else {
		info($dbg,"INFO: Disk can't have been set up correctly; so no validation file possible.");
	}
}
# Either the disk is unformatted (or has been blanked)
# or it has existing partitions that need to be removed.
# The latter will usually be the case, but the former may happen.

# 1st, detect them (same as above; ok for both 1.7.1 & 1.8.8)
# if get this: [Unable to open /dev/sda - unrecognised disk label.]
# then re-write the disk label so we can manipulate it correctly below.
# stderr is captured too, and both spellings are accepted, so the message is
# seen wherever parted prints it.
$part_line = `parted --script $rawdev print 2>&1`;
if ($part_line =~ /unrecogni[sz]ed disk label/) {
	info($dbg,"INFO: [$rawdev] unrecognized; making a new disk label.");
	system "parted --script $rawdev mklabel gpt"; # just nuke it.
	sleep 1;
}
# and then continue....
$part_line = `parted --script $rawdev print | grep -A9 '^Number'  |tail -2 |head -1`;

# if it's blank, $part_line will be:
# Number  Start  End  Size  Type  File system  Flags
$N = @L = split(/\s+/,$part_line);

# this assumes that the output from parted is the same across various distro's.
# Field [1] is a partition number only when the line describes a partition;
# on a blank disk it is the header word "Start" (or missing altogether).
if (defined $L[1] && $L[1] =~ /\A\d+\z/) { # it has some identifiable partitions, so delete them
	$part_nbr = int($L[1]);
	# now delete the partitions, in decr order
	for ($i=$part_nbr;$i>0;$i--){
		if (system("parted --script $rawdev rm $i") == 0) {  # OK 1.7.1, 1.8.8
			info($dbg,"INFO: Partition [$i] on device [$rawdev] deleted!");
		} else {
			info($dbg,"WARN: Could not delete partition [$i] on device [$rawdev] (it may not exist).");
		}
	}
}

# The disk is blank at this point, so create the new partitions.  Both are
# made as swap partitions; the 2nd one has to be mkfs.xxx'ed afterwards anyway.
info($dbg,"INFO: Device [$rawdev] appears to be blank.  Will create new partitions");
$mk_frst_part_cmd = "parted --script $rawdev mkpartfs primary linux-swap 0 $swap_MB"; # OK 1.7.1, 1.8.8
info($dbg,"INFO: 1st parted cmd: [$mk_frst_part_cmd] ");
if (system("$mk_frst_part_cmd") != 0) {
	print STDERR "WARN: parted reported a problem creating the swap partition on [$rawdev]\n";
}

info($dbg,"INFO: Sleeping for 1 s...");
sleep 1;

$swap_MB += 1; # increment start point
# making 2nd partition swap also, since it's faster and will write XFS on top of it.
# OK 1.7.1, 1.8.8
$mk_scnd_part_cmd = "parted --script $rawdev mkpartfs primary linux-swap $swap_MB $disksize_MB";
info($dbg,"INFO: 2nd parted cmd: [$mk_scnd_part_cmd] ");
if (system("$mk_scnd_part_cmd") != 0) {
	print STDERR "WARN: parted reported a problem creating the scratch partition on [$rawdev]\n";
}
info($dbg,"INFO: Sleeping for 1 s...");
sleep 1;

# make sure an automount didn't automount the partition when a partition was created
# (this umount is expected to fail when nothing was automounted)
system "umount $device";

# print "paused"; $tmp = <STDIN>;

# now force a new xxxx file system onto the second partition
info($dbg,"INFO: Now writing [$ftype] filesystem onto [$device]\nwith this command: [$mkfs  $forceflag  $device]");
if (system("$mkfs  $forceflag  $device") != 0) {
	die "ERROR: [$mkfs] failed on [$device]; the scratch partition is not usable.\n";
}
#print "paused"; $tmp=<STDIN>;

# show the resulting partition table in debug output
my $parted_info = `parted --script $rawdev print`;
info($dbg,"$parted_info");
#print "paused"; $tmp=<STDIN>;
# and write the validation file onto the new /scratch partition
# mount it again...
$mounttest = system("mount -t $ftype $device $mountpoint");
if ($mounttest == 0) {
	info($dbg,"INFO: Device [$device] mounted!\nINFO: Now touching the validation file");
	system "touch $valid_file; chmod a-w $valid_file"; # also sets mode so can't accidentally rm it.
	# checking
	if (-f "$valid_file") {
		info($dbg,"INFO: Validation file [$valid_file] checks out!");
	} else {
		die "ERROR: Something went wrong in the touch.  Please check!!\n";
	}
} else {
	# without the mount there is no scratch space and no validation file,
	# so do not go on to report success.
	die "ERROR: Can't mount freshly formatted [$device] on [$mountpoint].  Please check!!\n";
}

FINE:

# Reached both after a fresh format and via 'goto FINE' when the disk was
# already valid.  On perceus vnfs, /etc/fstab will already have the /dev/sda1
# partition marked as swap; enable it explicitly here.
my $swap_dev = "$rawdev$swap_part";
info($dbg, "INFO: Executing 'swapon' on [$swap_dev].");
system "swapon $swap_dev";

print STDERR "INFO: format_mount.pl: Brand spankin' new disk [$rawdev] ready to go!!\n";

# info(LEVEL, MESSAGE): progress/debug output on the selected output handle.
#   LEVEL == 0 -> print nothing
#   LEVEL == 1 -> print a single '.' as a progress tick
#   LEVEL  > 1 -> print MESSAGE followed by a newline
sub info {
	my ($level, $text) = @_;
	return if $level == 0;
	print "." if $level == 1;
	print "$text\n" if $level > 1;
}

# this is a much shorter, cruder approach, but it may do everything
# that perceus needs.  You prep one disk as you want it, dump the
# partition info with sfdisk, then read it back in with sfdisk again.
# from John Hanks <jbh@broadinstitute.org>
# You can't beat sfdisk if you need to script partitioning a lot of
# drives. I typically fdisk or parted one disk as a template, then
# "sfdisk -d /dev/somedisk > disk-type-part-desc.sfdisk", put that file
# somewhere accessible to all nodes and then have the nodes "cat
# disk-type-part-desc.sfdisk | sfdisk /dev/somedisk". Even writing your
# own generic recipes for sfdisk is pretty easy if you want one general
# layout to apply for any size disk.
