# The math::statistics package is part of Tcllib, the Tcl standard library.
# It provides procedures for many statistical tasks, including
# generating a normal curve.

package require math::statistics

# histProcs.tcl has procs for scaling values and drawing the X and Y axes

source histProcs.tcl

# Default histogram bin width.  A "-bin N" pair on the command line
# overrides it.
#
# The width is later used as an integer divisor when bucketing and the
# resulting bin values are sorted as integers, so a missing, non-integer,
# zero or negative value is rejected here with a clear message instead of
# surfacing later as a divide-by-zero or sort error.

set binSize 5
if {[set pos [lsearch -exact $argv -bin]] >= 0} {
  incr pos
  set binSize [lindex $argv $pos]
  if {![string is integer -strict $binSize] || $binSize <= 0} {
    puts stderr "-bin requires a positive integer, got \"$binSize\""
    exit 1
  }
}

# Slurp the whole input file into d.  The file name is taken from the
# last command line argument.
set logPath [lindex $argv end]
set chan [open $logPath r]
set d [read $chan]
close $chan

# Step through each line of the input.
# The first 15 characters of a line are treated as a timestamp of the
# form "Mon DD HH:MM:SS"; " 2010" is appended so clock scan is given a
# year to go with it.  Lines for which clock scan fails are skipped.
# counts is an array of how many email messages were transferred in an
# hour, indexed by the hour (the scanned time in seconds divided by 3600).

foreach line [split $d \n] {
  set datetime "[string range $line 0 14] 2010"
  if {[catch {
    clock scan $datetime -format "%b %d %H:%M:%S %Y"
  } secs]} {
    continue
  }
  # Braced so the expression is substituted once, by expr itself.
  set hour [expr {$secs / 3600}]
  incr counts($hour)
}

# Calculate some statistics over the per-hour counts:
# mean, median and standard deviation.

# Without at least one counted line there is nothing to analyse; say so
# rather than failing on an unset variable further down.
if {[array size counts] == 0} {
  puts stderr "no lines with a recognizable timestamp were found"
  exit 1
}

# Flatten the array values into a plain list for math::statistics.
set data {}
foreach nm [array names counts] {
  lappend data $counts($nm)
}

set mean [::math::statistics::mean $data]
set median [::math::statistics::median $data]
set stdev [::math::statistics::stdev $data]

# Build the histogram.  Each per-hour count is snapped down to the start
# of its bin by taking advantage of the truncation effect of integer
# division:
#   27/5 = 5, multiplied by 5 = 25
#   28/5 = 5, multiplied by 5 = 25
#   etc.
# histogram is an array indexed by bin value, holding how many hours fell
# into that bin.

foreach hr [array names counts] {
  set wholeBins [expr {$counts($hr) / $binSize}]
  set bin [expr {$wholeBins * $binSize}]
  incr histogram($bin)
}

# Find the smallest and largest bin values; they are used to scale
# the graph along X.

set sortedCounts [lsort -integer [array names histogram]]
set minCount [lindex $sortedCounts 0]

# The upper bound is padded by 5 so the last value isn't at the exact
# end of the graph.
set maxCount [expr {[lindex $sortedCounts end] + 5}]

# Y-axis range.  Minimum occurrences will be 0 - histograms should be
# 0-based unless you are trying to lie with statistics.

set minOccur 0

# Find the tallest bin.  Two separate facts are recorded:
#   peakOccur - how many hours fell into the tallest bin (a height)
#   mode      - which bin that is (a bin value, comparable with the mean)
# Both are seeded from the first bin so that mode is always defined, even
# when the first bin is the tallest.  On a tie the lowest bin wins.

set mode [lindex $sortedCounts 0]
set peakOccur $histogram($mode)

foreach nm $sortedCounts {
  if {$histogram($nm) > $peakOccur} {
    set peakOccur $histogram($nm)
    set mode $nm
  }
}

# Leave some headroom above the tallest bar.
set maxOccur [expr {$peakOccur + 5}]

# Skew estimate: (mean - mode) / standard deviation.
set skew [expr {($mean - $mode) / $stdev}]

# Create and grid labels with the statistics.  Each row is a caption
# label and a value label bound to the named global variable.
set row 1
foreach {txt var} {Mean mean
                   Median median
                   Mode mode
                   "Standard Deviation" stdev
                   "Skew Estimate" skew} {
  set w [label .l_$txt -text $txt]
  grid $w -row $row -column 1 -sticky w
  set w [label .l_$var -textvariable $var]
  grid $w -row $row -column 2 -sticky w
  incr row
}

# Create and grid a canvas below the labels
set cvs [canvas .c -height 400 -width 500 -background white]
grid $cvs -row $row -column 1 -columnspan 2


# Put at most 10 steps on each axis
set yStepSize [makeBetterStep [expr {$maxOccur / 10}]]
set xStepSize [makeBetterStep [expr {$maxCount / 10}]]

# Draw the Y and X axes and their labels
drawYaxis $cvs $minOccur $maxOccur 30 20 $yStepSize
drawXaxis $cvs $minCount $maxCount 30 20 $xStepSize

# Draw vertical lines at the mean and at 1 & 2 standard deviations
# either side of it:
# mean in red, 1 stdev: orange, 2 stdev: goldenrod

foreach {nSigma color} {
   0 red
   1 orange
  -1 orange
   2 goldenrod
  -2 goldenrod
} {
  set xpos [scaleValue [expr {$mean + $nSigma * $stdev}] \
      $minCount $maxCount 30 500]
  $cvs create line $xpos 0 $xpos 380 -fill $color
}

# Draw the histogram bars as rectangles, one per bin
foreach count $sortedCounts {
  set xpos [scaleValue $count $minCount $maxCount 30 500]
  set ypos [scaleValue $histogram($count) \
      $minOccur $maxOccur 20 400]
  $cvs create rectangle $xpos 380 \
      [expr {$xpos+$binSize}] [expr {400 - $ypos}] \
      -fill blue -outline blue
}


# Overlay a normal curve in green.
# maxNormal is the normal density evaluated at the mean.  maxY is the
# scaled height of the tallest bar; it is passed as the upper bound when
# scaling each density value - presumably so the curve's peak lines up
# with the tallest bar (confirm against scaleValue in histProcs.tcl).

set maxNormal [::math::statistics::pdf-normal \
    $mean $stdev $mean]
set maxY [scaleValue $peakOccur $minOccur $maxOccur 20 400]

# The curve is drawn as short segments, each starting where the
# previous one ended.
set prevx 30
set prevy 380
for {set x 0} {$x < $maxCount} {incr x 2} {
  set y [::math::statistics::pdf-normal $mean $stdev $x]
  set xpos [scaleValue $x $minCount $maxCount 30 500]
  set ypos [scaleValue $y 0 $maxNormal 20 $maxY]
  set ypos [expr {400-$ypos}]
  $cvs create line $prevx $prevy $xpos $ypos -fill green
  set prevx $xpos
  set prevy $ypos
}

