Imports System.Math
Imports System.Collections.Generic
'''
''' Basic Stats class to keep track of summary statistics
'''
'''
Public NotInheritable Class CBasicSummaryStats
Public dMean As Double
Public dStdDev As Double
Public dSum As Double
Public dMeanSq As Double
Public dVariance As Double
Public dSumXSq As Double
End Class
Public NotInheritable Class CDataSortedByDate
Public rgDataSortedByDate As New ArrayList(128) ' of CDataEntry
Public rgDataLookupByDate As New Hashtable(128) ' (Of DateTime, CDataEntry)
Public dtFirstDataEntry As DateTime
Public dtLastDataEntry As DateTime
Public summaryStats As CSummaryStats
Public Sub CalculateAllCorrelationCoeffs(ByRef meansd As CSummaryStats)
' Pearson's r = N * sum(XY) - sum(X)sum(Y)
' -------------------------------------------------------------
' sqrt((N*sum(X^2)) - sum(X)^2) * sqrt((N*sum(Y^2)) - sum(Y)^2)
Dim Numerator As Double
Dim Denominator As Double
Dim Denom1 As Double
Dim Denom2 As Double
With meansd
Numerator = (.nDataPoints * .dSumX1Y) - (.X1Stats.dSum * .YStats.dSum)
Denom1 = (.nDataPoints * .dSumX1Sq) - (.X1Stats.dSum * .X1Stats.dSum)
Denom2 = (.nDataPoints * .dSumYSq) - (.YStats.dSum * .YStats.dSum)
Denominator = Sqrt(Denom1) * Sqrt(Denom2)
.RYX1 = Numerator / Denominator
Numerator = (.nDataPoints * .dSumX2Y) - (.X2Stats.dSum * .YStats.dSum)
Denom1 = (.nDataPoints * .dSumX2Sq) - (.X2Stats.dSum * .X2Stats.dSum)
Denom2 = (.nDataPoints * .dSumYSq) - (.YStats.dSum * .YStats.dSum)
Denominator = Sqrt(Denom1) * Sqrt(Denom2)
.RYX2 = Numerator / Denominator
Numerator = (.nDataPoints * .dSumX1X2) - (.X1Stats.dSum * .X2Stats.dSum)
Denom1 = (.nDataPoints * .dSumX1Sq) - (.X1Stats.dSum * .X1Stats.dSum)
Denom2 = (.nDataPoints * .dSumX2Sq) - (.X2Stats.dSum * .X2Stats.dSum)
Denominator = Sqrt(Denom1) * Sqrt(Denom2)
.RX1X2 = Numerator / Denominator
' Now calculate partial correlations
' rX1Y·X2 = rX1Y — (rX1X2)(rYX2)
' ------------------------------
' sqrt[1—rX1X2^2] * sqrt[1—rYX2^2]
' rX1,Y . X2
Numerator = .RYX1 - (.RX1X2 * .RYX2)
Denom1 = (1 - (.RX1X2 * .RX1X2))
Denom2 = (1 - (.RYX2 * .RYX2))
Denominator = Sqrt(Denom1) * Sqrt(Denom2)
.PartialRX1_Y_X2 = Numerator / Denominator
' rX2,Y . X1
Numerator = .RYX2 - (.RX1X2 * .RYX1)
Denom1 = (1 - (.RX1X2 * .RX1X2))
Denom2 = (1 - (.RYX1 * .RYX1))
Denominator = Sqrt(Denom1) * Sqrt(Denom2)
.PartialRX2_Y_X1 = Numerator / Denominator
' Semipartial correlations are next
' These are used to calculate the beta weight
' of each independent variable in the multiple
' regression equation :)
' rY·X2(X1) = rYX2 — (rYX1)(rX1X2)
' ------------------------------
' sqrt[1—rX1X2^2]
' rY·X1(X2)
Numerator = .RYX1 - (.RYX2 * .RX1X2)
Denom1 = (1 - (.RX1X2 * .RX1X2))
Denominator = Sqrt(Denom1)
.SemiPartialRY_X1X2 = Numerator / Denominator
' rY·X2(X1)
Numerator = .RYX2 - (.RYX1 * .RX1X2)
Denom1 = (1 - (.RX1X2 * .RX1X2))
Denominator = Sqrt(Denom1)
.SemiPartialRY_X2X1 = Numerator / Denominator
.BetaX1 = .SemiPartialRY_X1X2 * Sqrt(1 - (.RX1X2 * .RX1X2))
.BetaX2 = .SemiPartialRY_X2X1 * Sqrt(1 - (.RX1X2 * .RX1X2))
End With
End Sub
'''
''' Helper function: Find a data entry given the date
'''
''' the date for which to find a data entry
''' The CDataEntry that corresponds to the data, otherwise
''' Nothing if there is no data entry for that date
'''
Public Function FindDate(ByVal dt As DateTime) As CDataEntry
If rgDataLookupByDate.ContainsKey(dt) = True Then
Return rgDataLookupByDate(dt)
Else
Return Nothing
End If
End Function
Public Sub CalculateSummaryStatistics()
summaryStats = New CSummaryStats
Dim nCnt As Integer = 0
Dim das As CDataEntry
nCnt = rgDataSortedByDate.Count
summaryStats.nDataPoints = CDbl(nCnt)
Dim yValue As Double
Dim x1Value As Double
Dim x2Value As Double
Dim xLinear As Double
Dim xSQ As Double
Dim x2SQ As Double
Dim ySQ As Double
Dim dtMinDt As DateTime = DateTime.MaxValue
Dim fFirst As Boolean = True
With summaryStats
For Each das In rgDataSortedByDate
If fFirst = True Then
dtMinDt = das.dt
fFirst = False
End If
' Linear trend
' x1Value = CDbl(CInt(ts.TotalDays))
' Let's look for a quadratic trend...
' Could also look for a logarithmic trend...
' x1Value = Log(x1Value)
Dim ts As TimeSpan
ts = das.dt.Date.Subtract(dtMinDt)
' Note: we are not using the GetX1FromDate function
' but we could be - it's a simple linear transformation
' but the below code makes it explicit what x1 represents
xLinear = ts.TotalDays() + 1
x1Value = xLinear * xLinear
' The number of weekend days and weekday days is not normally distributed and
' is categorical (nominal) in nature - using point biserial with categories
' "weekend" and "weekday" with probabilities
' 2/7 and 5/7 for (p) and (1-p) would work. Instead, let's keep it simple and use
' Pearson's r. Weekends will have a value of 2.5 and weekdays a value of -1.
x2Value = GetX2FromDate(das.dt)
das.dXLinear = xLinear
das.dX = x1Value
das.dX2 = x2Value
yValue = das.dY
.X1Stats.dSum += x1Value
.X2Stats.dSum += x2Value
.YStats.dSum += yValue
.dSumX1Y += x1Value * yValue
.dSumX2Y += x2Value * yValue
xSQ = x1Value * x1Value
x2SQ = x2Value * x2Value
ySQ = yValue * yValue
.dSumX1Sq += xSQ
.dSumX2Sq += x2SQ
.dSumYSq += ySQ
.dSumX1X2 += x1Value * x2Value
Next
.X1Stats.dMean = summaryStats.X1Stats.dSum / summaryStats.nDataPoints
.X2Stats.dMean = summaryStats.X2Stats.dSum / summaryStats.nDataPoints
.YStats.dMean = summaryStats.YStats.dSum / summaryStats.nDataPoints
.X1Stats.dMeanSq = .X1Stats.dMean * .X1Stats.dMean
.X2Stats.dMeanSq = .X2Stats.dMean * .X2Stats.dMean
.YStats.dMeanSq = .YStats.dMean * .YStats.dMean
.X1Stats.dVariance = (.dSumX1Sq / .nDataPoints) - .X1Stats.dMeanSq
.X2Stats.dVariance = (.dSumX2Sq / .nDataPoints) - .X2Stats.dMeanSq
.YStats.dVariance = (.dSumYSq / .nDataPoints) - .YStats.dMeanSq
If .X1Stats.dVariance <= 0 Then
' No variability, std dev is 0
.X1Stats.dStdDev = 0
Else
.X1Stats.dStdDev = Math.Sqrt(.X1Stats.dVariance)
End If
If .X2Stats.dVariance <= 0 Then
' No variability, std dev is 0
.X2Stats.dStdDev = 0
Else
.X2Stats.dStdDev = Math.Sqrt(.X2Stats.dVariance)
End If
If .YStats.dVariance <= 0 Then
' No variability, std dev is 0
.YStats.dStdDev = 0
Else
.YStats.dStdDev = Math.Sqrt(.YStats.dVariance)
End If
End With
' Calculate the correlation between variables X1, X2, Y
CalculateAllCorrelationCoeffs(summaryStats)
' Now that we have our r's, we can calculate Betas and Intercepts
With summaryStats
.B1 = .RYX1 * (.YStats.dStdDev / .X1Stats.dStdDev)
.A1 = .YStats.dMean - (.B1 * .X1Stats.dMean)
.B2 = .RYX2 * (.YStats.dStdDev / .X2Stats.dStdDev)
.A2 = .YStats.dMean - (.B2 * .X2Stats.dMean)
.BX1_12 = .BetaX1 * (.YStats.dStdDev / .X1Stats.dStdDev)
.BX2_12 = .BetaX2 * (.YStats.dStdDev / .X2Stats.dStdDev)
.A12 = .YStats.dMean - (.BX1_12 * .X1Stats.dMean) - (.BX2_12 * .X2Stats.dMean)
End With
End Sub
End Class
'''
''' Summary statistics for X1 (time), X2 (weekend/weekday) and
''' Y (# of sales, our Dependent Variable (DV))
'''
''' Also keeps track of sum of X1*Y, Sum X1^2, etc,
''' for calculating correlation coefficients
Public NotInheritable Class CSummaryStats
Private dN As Double
'''
''' The number of data points available, stored as a double
''' for easy calculation
'''
'''
''' The number of data points available, as a double
'''
Public Property nDataPoints() As Double
Get
Return dN
End Get
Set(ByVal value As Double)
dN = value
End Set
End Property
Public X1Stats As New CBasicSummaryStats
Public X2Stats As New CBasicSummaryStats
Public YStats As New CBasicSummaryStats
' Sum of X1 * Y and Sum of X1 squared (X1^2)
Public dSumX1Y As Double
Public dSumX1Sq As Double
' Sum of X2 * Y and Sum of X2 squared (X2^2)
Public dSumX2Y As Double
Public dSumX2Sq As Double
' Sum of X1 * X2 and Sum of Y squared (Y^2)
Public dSumX1X2 As Double
Public dSumYSq As Double
' Correlation between Y and X1, Y and X2, X1 and X2
Public RYX1 As Double
Public RYX2 As Double
Public RX1X2 As Double
' Partial correlations
Public PartialRX1_Y_X2 As Double
Public PartialRX2_Y_X1 As Double
' Semipartial correlations
Public SemiPartialRY_X1X2 As Double
Public SemiPartialRY_X2X1 As Double
' Regression-related, Beta weight of X1, X2
Public BetaX1 As Double
Public BetaX2 As Double
Public B1 As Double
Public A1 As Double
Public B2 As Double
Public A2 As Double
Public BX1_12 As Double
Public BX2_12 As Double
Public A12 As Double
'''
''' Best prediction for Y only knowing X1
'''
''' X1
''' Best prediction for Y, using least-squares principle
'''
Public Function PredictYFromX1(ByVal Xval As Double) As Double
Return (B1 * Xval) + A1
End Function
'''
''' Best prediction for Y only knowing X2
'''
''' X2
''' Best prediction for Y, using least-squares principle
'''
Public Function PredictYFromX2(ByVal X2val As Double) As Double
Return (B2 * X2val) + A2
End Function
'''
''' Best prediction for Y knowing both X1 and X2
'''
''' X1
''' X2
''' Simple multiple regression best prediction for Y, using least-squares principle
'''
Public Function PredictYFromX1X2(ByVal X1val As Double, ByVal X2val As Double) As Double
Return (BX1_12 * X1val) + (BX2_12 * X2val) + A12
End Function
End Class
Public Module SummaryStatsModule
'''
''' Calculates the expected values if we only knew X1, X2,
''' or knew both X1 and X2. Also calculates the residuals.
'''
''' Date and sales entry data structure
'''
Public Sub InitResiduals(ByVal sd As CDataSortedByDate)
Dim das As CDataEntry ' date and sales entry
With sd.summaryStats
For Each das In sd.rgDataSortedByDate
das.dExpectedX1 = .PredictYFromX1(das.dX)
das.dResidualX1 = das.dY - das.dExpectedX1
das.dExpectedX2 = .PredictYFromX2(das.dX2)
das.dResidualX2 = das.dY - das.dExpectedX2
das.dExpectedX1X2 = .PredictYFromX1X2(das.dX, das.dX2)
das.dResidualX1X2 = das.dY - das.dExpectedX1X2
Next
End With
End Sub
'''
''' Turn the passage of time into number of days elapsed,
''' transformed to change scale and ensure non-zero values
'''
''' The date
''' The oldest date in the dataset
''' An "X" value based on dt
'''
Public Function GetXLinearFromDate(ByVal dt As DateTime, ByVal dtOldest As Date) As Double
Dim ts As TimeSpan
ts = dt.Date.Subtract(dtOldest)
Return CDbl(CInt(ts.TotalDays) + 1) * 0.1
End Function
'''
''' Turn the passage of time into number of days elapsed, squared,
''' transformed to change scale and ensure non-zero values
'''
''' The date
''' The oldest date in the dataset
''' An "X" value (quadratic passage of time) based on dt
'''
Public Function GetX1FromDate(ByVal dt As DateTime, ByVal dtOldest As Date) As Double
Dim ts As TimeSpan
ts = dt.Date.Subtract(dtOldest)
Dim d As Double = CDbl(CInt(ts.TotalDays) + 1)
Return d * d * 0.01
End Function
'''
''' Assigns a number to weekday vs. weekend, given the date
'''
''' The date
''' -1 for weekday, +2.5 for weekend
'''
Public Function GetX2FromDate(ByVal dt As DateTime) As Double
Const dWeekendVal As Double = 2.5
Const dWeekdayVal As Double = -1.0
Dim d As Double
Select Case dt.DayOfWeek
Case Is = DayOfWeek.Saturday
d = dWeekendVal
Case Is = DayOfWeek.Sunday
d = dWeekendVal
Case Else
d = dWeekdayVal
End Select
Return d
End Function
End Module