/*
 * intrd.c
 * author Albert Lee <trisk@forkgnu.org>
 * Tue, 01 Feb 2011 16:10:01 -0500
 * changeset 4 a42e422f55c0
 * parent 3 380ada8fd621
 * permissions -rw-r--r--
 * More WIP, need to convert to uu_list.
 */

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates.  All rights reserved.
 */

#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <getopt.h>
#include <libgen.h>
#include <syslog.h>
#include <kstat.h>
#include <sys/processor.h>
#include <sys/modhash.h>

#include "intrs.h"

/*
 * Interrupt vector info.
 *
 * getstat() keeps one ivec_t per interrupt number ("ino") of a bus, in the
 * bus_dev_t.ivecs array, and fills it from the matching pci_intrs kstat.
 */
typedef struct ivec {
	int cookie;		/* not assigned anywhere in the visible code */
	uint64_t time;		/* value of the pci_intrs "time" kstat entry */
	hrtime_t crtime;	/* creation time of the pci_intrs kstat */
	int pil;		/* presumably pci_intrs "pil"; not yet filled in */
	int ino;		/* presumably pci_intrs "ino"; not yet filled in */
	int ihs;		/* presumably pci_intrs "ihs"; not yet filled in */
	int num_ino;		/* getstat() currently always stores 1 here */
	int origcpu;		/* not referenced in the visible code */
	int nowcpu;		/* not referenced in the visible code */
	int inum;		/* not referenced in the visible code */
} ivec_t;

/* Not referenced in the visible code; presumably for the uu_list conversion. */
uu_list_pool_t *ivec_pool;

/* MSI device info */
typedef struct msi_dev {
	msi_dev_t *next;
	char *devpath[MAXPATHLEN];
	int num_intr;
	*ivec_t ivecs;
} msi_dev_t;

uu_list_pool_t *msi_dev_pool;

/*
 * Bus info.
 *
 * Node of the per-CPU singly linked list rooted at cpu_stat_t.bus_head.
 * getstat() creates one node per distinct pci_intrs "buspath" seen on a CPU.
 * The link uses the struct tag because the bus_dev_t typedef is not complete
 * until the closing brace.
 */
typedef struct bus_dev {
	struct bus_dev *next;		/* next bus in the list, or NULL */
	char buspath[MAXPATHLEN];	/* bus path, copied in with strlcpy() */
	int num_intr;			/* number of entries in ivecs */
	ivec_t *ivecs;			/* array indexed by interrupt number (ino) */
	int is_pcplusmp;		/* result of intrinfo() for this bus */
	msi_dev_t *msi_head;		/* list of MSI devices on this bus */
} bus_dev_t;

/* Not referenced in the visible code; presumably for the uu_list conversion. */
uu_list_pool_t *bus_dev_pool;

/*
 * Per-CPU statistics.
 *
 * main() allocates an array of max_cpus of these; getstat() indexes it by
 * CPU id.
 */
typedef struct cpu_stat {
	int state;		/* set to P_ONLINE by getstat() for on-line CPUs */
	uint64_t tot;		/* sum of cpu_nsec_idle, _user and _kernel */
	hrtime_t crtime;	/* creation time of the cpu:<cpuid>:sys kstat */
	bus_dev_t *bus_head;	/* list of buses with interrupts on this CPU */
} cpu_stat_t;

/* Not referenced in the visible code; presumably for the uu_list conversion. */
uu_list_pool_t *cpu_stat_pool;

/* Interrupt statistics */
type def struct intr_stat {
	hrtime_t snaptime;
	*cpu_stat_t *cpus;
} intr_stat_t;

uu_list_pool_t *intr_stat_pool;

/*
 * Intervals between samples. ONECPU_SLEEPTIME is passed directly to sleep()
 * in main(), which suggests the unit is seconds.
 */
typedef enum sleeptime {
	NORMAL_SLEEPTIME = 10,		/* time to sleep between samples */
	IDLE_SLEEPTIME = 45,		/* time to sleep when idle */
	ONECPU_SLEEPTIME = 60 * 15,	/* used if only 1 CPU on system */
} sleeptime_t;

int using_scengen;			/* 1 if using scenario simulator */
int debug;		/* 1 if -D was given; log mask goes up to LOG_DEBUG */
int foreground;		/* 1 if -f or -S was given; main() skips daemon() */

/* sysconf(_SC_CPUID_MAX) + 1, set in main(); bounds the CPU ids we track */
int max_cpus;

/* Current interval; getstat() raises it to ONECPU_SLEEPTIME on 1-CPU systems */
sleeptime_t sleeptime = NORMAL_SLEEPTIME;

float idle_intrload = 0.1; 		/*  idle if interrupt load < 10% */

/* Upper bound on (time spent gathering kstats / sleeptime); see getstat() */
float timerange_toohi = 0.1;
int statslen = 60;	/* time period (in secs) of deltas to keep */

/* Defined further down; declared here because main() is the first caller. */
static void load_simulator(const char *file);

/*
 * Entry point of intrd.
 *
 * Parses the test-only options, detaches from the terminal unless asked to
 * stay in the foreground, opens syslog and the kstat chain, and checks that
 * at least one pci_intrs kstat exists.
 *
 * Returns 0 on a normal exit and 1 on any set-up failure (bad CPU count,
 * daemon() or kstat_open() failure, out of memory, or use of the
 * unimplemented scenario simulator).
 */
int main(int argc, char **argv)
{
	const char *cmdname;
	kstat_ctl_t *kc;
	kstat_t *ksp;
	intr_stat_t stat;
	long cpuid_max;
	int c;		/* int, not char: getopt() signals the end with -1 */

	cpuid_max = sysconf(_SC_CPUID_MAX);
	if (cpuid_max < 0 || cpuid_max >= INT_MAX) {
		return 1;
	}
	max_cpus = (int)cpuid_max + 1;

	cmdname = basename(argv[0]);

	/*
	 * Parse arguments. intrd does not accept any public arguments; the
	 * arguments below are meant for testing purposes. -D generates a
	 * significant amount of syslog output. -f keeps intrd in the
	 * foreground. -S <filename> hands the filename to load_simulator(),
	 * which is meant to provide a kstat "simulator" that can be used to
	 * feed information to intrd and verify intrd's responses; it also
	 * implies -f.
	 */
	while ((c = getopt(argc, argv, "S:Df")) != -1) {
		switch (c) {
		case 'S':
			using_scengen = 1;
			foreground = 1;
			load_simulator(optarg);
			break;
		case 'D':
			debug = 1;
			break;
		case 'f':
			foreground = 1;
			break;
		default:
			/* Unknown options are ignored. */
			break;
		}
	}

	if (!foreground) {
		if (daemon(0, 0) == -1) {
			return 1;
		}
	}

	if (using_scengen) {
		/* scengen not implemented */
		return 1;
	}

	openlog(cmdname, LOG_PID, LOG_DAEMON);
	(void) setlogmask(LOG_UPTO(debug ? LOG_DEBUG : LOG_INFO));

	kc = kstat_open();
	if (kc == NULL) {
		return 1;
	}

	/*
	 * If no pci_intrs kstats were found, we need to exit, but we can't
	 * because SMF will restart us and/or report an error to the
	 * administrator. But there's nothing an administrator can do. So
	 * print out a message to syslog and silently pause forever.
	 */
	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
		if ((ksp->ks_type == KSTAT_TYPE_NAMED) &&
		    (strcmp(ksp->ks_module, "pci_intrs") == 0)) {
			break;
		}
	}
	if (ksp == NULL) {
		(void) kstat_close(kc);
		syslog(LOG_INFO, "no interrupts were found: " \
			"your I/O bus may not yet be supported\n");
		/* sleep() returns non-zero only when it is interrupted. */
		do {} while (sleep(ONECPU_SLEEPTIME) == 0);
		return 0;
	}

	/* calloc() so that every cpu_stat_t starts out with NULL list heads. */
	stat.cpus = calloc(max_cpus, sizeof (cpu_stat_t));
	if (stat.cpus == NULL) {
		(void) kstat_close(kc);
		return 1;
	}

	/* TODO: the sampling loop is not written yet; stat.cpus is unused. */

	free(stat.cpus);
	(void) kstat_close(kc);
	return 0;
}


/*
 * Check a condition that is expected to hold.
 *
 * Returns 0 when `condition' is non-zero. Otherwise logs `msg' to syslog at
 * LOG_DEBUG priority, prefixed with "VERIFY: ", and returns 1.
 */
static int verify(int condition, const char *msg)
{
	if (condition) {
		return 0;
	}

	syslog(LOG_DEBUG, "VERIFY: %s", msg);
	return 1;
}

/*
 * Hook for the -S <filename> test option; main() passes it optarg.
 * Currently an empty stub: `file' is ignored and nothing is loaded.
 */
static void load_simulator(const char *file)
{
}

static int getstat($$);
/*
int generate_delta($$);
int compress_deltas($);
int dumpdelta($);

int goodness($);
int imbalanced($$);
int do_reconfig($);

int goodness_cpu($$);		# private function
int move_intr($$$$);		# private function
int ivecs_to_string(@);		# private function
int do_find_goal($$$$);		# private function
int find_goal($$);		# private function
int do_reconfig_cpu2cpu($$$$);	# private function
int do_reconfig_cpu($$$);	
*/


/*
 *
 * What follows are the basic data-structure routines of intrd.
 *
 * getstat() is responsible for reading the kstats and generating a "stat"
 * structure.
 *
 * generate_delta() is responsible for taking two "stat" structures and
 * creating a new "delta" structure that represents what has changed over
 * time.
 *
 * compress_deltas() is responsible for taking a list of deltas and generating
 * a single delta structure that encompasses all the time periods described by
 * the deltas.
 */


/*
 *
 * getstat() is handed a reference to a kstat and generates a hash, returned
 * by reference, containing all the fields from the kstats which we need.
 * If it returns the scalar 0, it failed to gather the kstats, and the caller
 * should react accordingly.
 *
 * getstat() is also responsible for maintaining a reasonable $sleeptime.
 *
 * {"snaptime"}          kstat's snaptime
 * {<cpuid>}             one hash reference per online cpu
 *  ->{"tot"}            == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
 *  ->{"crtime"}         == cpu:<cpuid>:sys:crtime
 *  ->{"ivecs"}
 *     ->{<cookie#>}     iterates over pci_intrs::<nexus>:cookie
 *        ->{"time"}     == pci_intrs:<ivec#>:<nexus>:time (in nsec)
 *        ->{"pil"}      == pci_intrs:<ivec#>:<nexus>:pil
 *        ->{"crtime"}   == pci_intrs:<ivec#>:<nexus>:crtime
 *        ->{"ino"}      == pci_intrs:<ivec#>:<nexus>:ino
 *        ->{"num_ino"}  == num inos of single device instance sharing this entry
 *				Will be > 1 on pcplusmp X86 systems for devices
 *				with multiple MSI interrupts.
 *        ->{"buspath"}  == pci_intrs:<ivec#>:<nexus>:buspath
 *        ->{"name"}     == pci_intrs:<ivec#>:<nexus>:name
 *        ->{"ihs"}      == pci_intrs:<ivec#>:<nexus>:ihs
 *
*/

int getstat(kstat_ctl_t *kc, intr_stat_t *stat)
{
	int cpucnt = 0;
	kstat_t *ksp;
	hrtime_t minsnap, maxsnap;

	/* Hash of hash which matches (MSI device, ino) combos to kstats. */
	msi_dev_t *msidevs;

	/*
	 * kstats are not generated atomically. Each kstat hierarchy will
	 * have been generated within the kernel at a different time. On a
	 * thrashing system, we may not run quickly enough in order to get
	 * coherent kstat timing information across all the kstats. To
	 * determine if this is occurring, $minsnap/$maxsnap are used to
	 * find the breadth between the first and last snaptime of all the
	 * kstats we access. $maxsnap - $minsnap roughly represents the
	 * total time taken up in getstat(). If this time approaches the
	 * time between snapshots, our results may not be useful.
	*/

	minsnap = -1;		/* snaptime is always a positive number */
	maxsnap = minsnap;

	/*
	 * iterate over the cpus in cpu:<cpuid>::. check
	 * cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
	 * processor is "on-line". if not, it isn't accepting interrupts
	 * and doesn't concern us.
	 *
	 * record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap.
	 */

	cpu_stats = stat->cpus;
	bzero(cpu_stats, sizeof (cpu_stat_t) * max_cpus);

	for (ksp = kc->kc_chain; ksp != null; ksp = ksp->ks_next) {
		kstat_t *ksp_sys;
		kstat_named_t *knp;
		int cpu;
		hrtime_t snaptime;

		if ((ksp->ks_type != kstat_type_named) ||
		    strcmp(ksp->ks_module, "cpu_info") ||
		    (kstat_read(kc, ksp) == -1)) {
		    continue;
		}
		knp = kstat_data_lookup(ksp, "state");
		if ((knp == NULL) || strcmp(knp->name, PS_ONLINE) ||
		    ((cpu = ksp->ks_instance) >= max_cpus)) {
			continue;
		}
		ksp_sys = kstat_lookup(kc, "cpu", cpu, "sys");
		if ((ksp_sys == NULL) || (kstat_read(kc, ksp_sys) == -1)) {
			continue;
		}
		cpu_stats[cpu].state = P_ONLINE;
		knp = ksp_sys->ks_data;
		for (i = 0; i < ksp_sys->ks_ndata; i++) {
			if ((strcmp(knp[i].name, "cpu_nsec_idle") == 0) ||
			    (strcmp(knp[i].name, "cpu_nsec_user") == 0) ||
			    (strcmp(knp[i].name, "cpu_nsec_kernel") == 0)) {
				cpu_stats[cpu].tot += knp[i].value.ui64;
		}
		cpu_stats[cpu].crtime = ksp_sys->crtime;
		snaptime = ksp_sys->snaptime;
		if (minsnap == -1 || snaptime < minsnap) {
			minsnap = snaptime;
		}
		if (snaptime > maxsnap) {
			maxsnap = snaptime;
		}
		cpucnt++;

	}

	if (cpucnt <= 1) {
		sleeptime = ONECPU_SLEEPTIME;
		return 0;	/* nothing to do with 1 CPU */
	}

	/*
	 * Iterate over the ivecs. If the cpu is not on-line, ignore the
	 * ivecs mapped to it, if any.
	 *
	 * Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil,
	 * ino, name, and buspath. Check $minsnap/$maxsnap.
	 */

	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
		kstat_named_t *knp;
		int cpu;
		int ino;
		cpu_stat_t *cpup;
		bus_dev_t *busp;
		bus_dev_t *bus_last;
		ivec_t *ivecp;
		hrtime_t snaptime;

		if ((ksp->ks_type != KSTAT_TYPE_NAMED) ||
		    strcmp(ksp->ks_module, "pci_intrs") ||
		    (kstat_read(kc, ksp) == -1)) {
			continue;
		}
		knp = kstat_data_lookup(ksp, "cpu");
		if ((knp == NULL) || ((cpu = knp->value.ui32) >= max_cpus) ||
		    (cpu_stats[cpu].state != P_ONLINE)) {
			continue;
		}
		cpup = &cpu_stats[cpu];
		knp = kstat_data_lookup(ksp, "type");
		if ((knp == NULL) || strcmp(knp->value.c, "disabled")) {
			continue;
		}
		knp = kstat_data_lookup(ksp, "buspath");
		if (knp == NULL) {
			continue;
		}
		
		for (bus_last = NULL, busp = cpup->bus_head; busp != NULL;
		     bus_last = busp, busp = busp->next) {
			if (strcmp(knp->value.c, busp->buspath) == 0) {
				break;
			}
		}

		if (busp == NULL) {
			busp = malloc(sizeof (bus_dev_t));
			if (busp == NULL) {
				return -1;
			}

			busp->next = NULL;

			strlcpy(busp->buspath, knp->value.c, MAXPATHLEN);
			busp->is_pcplusmp =
			    intrinfo(busp->buspath, &(busp->num_intr));

			busp->ivecs = malloc(sizeof (ivec_t) * busp->num_intr);
			if (busp->ivecs == NULL) {
				free(busp);
				return -1;
			}
			bzero(busp->ivecs, sizeof (ivec_t) * busp->num_intr);

			if (bus_last == NULL) {
				cpup->bus_head = busp;
			} else {
				bus_last->next = busp;
			}
		}
		knp = kstat_data_lookup(ksp, "ino");
		if ((knp == NULL) ||
		    ((ino = knp->value.ui32) >= busp->num_intr)) {
			continue;
		}
		ivecp = &(busp->ivecs[ino]);

		knp = kstat_data_lookup(ksp, "time");
		if (knp == NULL) {
			continue;
		}
		ivecp->time = knp->value.ui64;

		if (busp->is_pcplusmp) {
			knp = kstat_data_lookup(ksp, "type");
			if (knp == NULL) {
				continue;
			}
			if (strcmp(knp->value.c, "msi") == 0) {
				for (msi_last = NULL, msip = busp->msi_head;
				     msip != NULL;
				     msi_last = msip, msip = msip->next) {
					if (ivecp->cookie == msip->cookie) {
						break;
					}
			}

		}
			
		ivecp->num_ino = 1;
		ivecp->crtime = ksp->crtime;
		snaptime = ksp->snaptime;
		if (minsnap == -1 || snaptime < minsnap) {
			minsnap = snaptime;
		}
		if (snaptime > maxsnap) {
			maxsnap = snaptime;
		}
	}

	/*
	 * All MSI interrupts of a device instance share a single MSI address.
	 * On X86 systems with an APIC, this MSI address is interpreted as CPU
	 * routing info by the APIC.  For this reason, on these platforms, all
	 * interrupts for MSI devices must be moved to the same CPU at the same
	 * time.
	 *
	 * Since all interrupts will be on the same CPU on these platforms, all
	 * interrupts can be consolidated into one ivec entry.  For such devices,
	 * num_ino will be > 1 to denote that a group move is needed.  
	 */

	/*
	 * Loop thru all MSI devices on X86 pcplusmp systems.
	 * Nop on other systems.
	 */


	/*
	 * We define the timerange as the amount of time spent gathering the
	 * various kstats, divided by our sleeptime. If we take a lot of time
	 * to access the kstats, and then we create a delta comparing these
	 * kstats with a prior set of kstats, that delta will cover
	 * substaintially different amount of time depending upon which
	 * interrupt or CPU is being examined.
	 *
	 * By checking the timerange here, we guarantee that any deltas
	 * created from these kstats will contain self-consistent data,
	 * in that all CPUs and interrupts cover a similar span of time.
	 *
	 * $timerange_toohi is the upper bound. Any timerange above
	 * this is thrown out as garbage. If the stat is safely within this
	 * bound, we treat the stat as representing an instant in time, rather
	 * than the time range it actually spans. We arbitrarily choose minsnap
	 * as the snaptime of the stat.
	 */

	 stat->snaptime = minsnap;
	 if (((maxsnap - minsnap) / sleeptime) > timerange_toohi) {
	 	return 0;
	}
	return 1;
}