Imports System.Math Imports System.Collections.Generic ''' ''' Basic Stats class to keep track of summary statistics ''' ''' Public NotInheritable Class CBasicSummaryStats Public dMean As Double Public dStdDev As Double Public dSum As Double Public dMeanSq As Double Public dVariance As Double Public dSumXSq As Double End Class Public NotInheritable Class CDataSortedByDate Public rgDataSortedByDate As New ArrayList(128) ' of CDataEntry Public rgDataLookupByDate As New Hashtable(128) ' (Of DateTime, CDataEntry) Public dtFirstDataEntry As DateTime Public dtLastDataEntry As DateTime Public summaryStats As CSummaryStats Public Sub CalculateAllCorrelationCoeffs(ByRef meansd As CSummaryStats) ' Pearson's r = N * sum(XY) - sum(X)sum(Y) ' ------------------------------------------------------------- ' sqrt((N*sum(X^2)) - sum(X)^2) * sqrt((N*sum(Y^2)) - sum(Y)^2) Dim Numerator As Double Dim Denominator As Double Dim Denom1 As Double Dim Denom2 As Double With meansd Numerator = (.nDataPoints * .dSumX1Y) - (.X1Stats.dSum * .YStats.dSum) Denom1 = (.nDataPoints * .dSumX1Sq) - (.X1Stats.dSum * .X1Stats.dSum) Denom2 = (.nDataPoints * .dSumYSq) - (.YStats.dSum * .YStats.dSum) Denominator = Sqrt(Denom1) * Sqrt(Denom2) .RYX1 = Numerator / Denominator Numerator = (.nDataPoints * .dSumX2Y) - (.X2Stats.dSum * .YStats.dSum) Denom1 = (.nDataPoints * .dSumX2Sq) - (.X2Stats.dSum * .X2Stats.dSum) Denom2 = (.nDataPoints * .dSumYSq) - (.YStats.dSum * .YStats.dSum) Denominator = Sqrt(Denom1) * Sqrt(Denom2) .RYX2 = Numerator / Denominator Numerator = (.nDataPoints * .dSumX1X2) - (.X1Stats.dSum * .X2Stats.dSum) Denom1 = (.nDataPoints * .dSumX1Sq) - (.X1Stats.dSum * .X1Stats.dSum) Denom2 = (.nDataPoints * .dSumX2Sq) - (.X2Stats.dSum * .X2Stats.dSum) Denominator = Sqrt(Denom1) * Sqrt(Denom2) .RX1X2 = Numerator / Denominator ' Now calculate partial correlations ' rX1Y·X2 = rX1Y — (rX1X2)(rYX2) ' ------------------------------ ' sqrt[1—rX1X2^2] * sqrt[1—rYX2^2] ' rX1,Y . X2 Numerator = .RYX1 - (.RX1X2 * .RYX2) Denom1 = (1 - (.RX1X2 * .RX1X2)) Denom2 = (1 - (.RYX2 * .RYX2)) Denominator = Sqrt(Denom1) * Sqrt(Denom2) .PartialRX1_Y_X2 = Numerator / Denominator ' rX2,Y . X1 Numerator = .RYX2 - (.RX1X2 * .RYX1) Denom1 = (1 - (.RX1X2 * .RX1X2)) Denom2 = (1 - (.RYX1 * .RYX1)) Denominator = Sqrt(Denom1) * Sqrt(Denom2) .PartialRX2_Y_X1 = Numerator / Denominator ' Semipartial correlations are next ' These are used to calculate the beta weight ' of each independent variable in the multiple ' regression equation :) ' rY·X2(X1) = rYX2 — (rYX1)(rX1X2) ' ------------------------------ ' sqrt[1—rX1X2^2] ' rY·X1(X2) Numerator = .RYX1 - (.RYX2 * .RX1X2) Denom1 = (1 - (.RX1X2 * .RX1X2)) Denominator = Sqrt(Denom1) .SemiPartialRY_X1X2 = Numerator / Denominator ' rY·X2(X1) Numerator = .RYX2 - (.RYX1 * .RX1X2) Denom1 = (1 - (.RX1X2 * .RX1X2)) Denominator = Sqrt(Denom1) .SemiPartialRY_X2X1 = Numerator / Denominator .BetaX1 = .SemiPartialRY_X1X2 * Sqrt(1 - (.RX1X2 * .RX1X2)) .BetaX2 = .SemiPartialRY_X2X1 * Sqrt(1 - (.RX1X2 * .RX1X2)) End With End Sub ''' ''' Helper function: Find a data entry given the date ''' ''' the date for which to find a data entry ''' The CDataEntry that corresponds to the data, otherwise ''' Nothing if there is no data entry for that date ''' Public Function FindDate(ByVal dt As DateTime) As CDataEntry If rgDataLookupByDate.ContainsKey(dt) = True Then Return rgDataLookupByDate(dt) Else Return Nothing End If End Function Public Sub CalculateSummaryStatistics() summaryStats = New CSummaryStats Dim nCnt As Integer = 0 Dim das As CDataEntry nCnt = rgDataSortedByDate.Count summaryStats.nDataPoints = CDbl(nCnt) Dim yValue As Double Dim x1Value As Double Dim x2Value As Double Dim xLinear As Double Dim xSQ As Double Dim x2SQ As Double Dim ySQ As Double Dim dtMinDt As DateTime = DateTime.MaxValue Dim fFirst As Boolean = True With summaryStats For Each das In rgDataSortedByDate If fFirst = True Then dtMinDt = das.dt fFirst = False End If ' Linear trend ' x1Value = CDbl(CInt(ts.TotalDays)) ' Let's look for a quadratic trend... ' Could also look for a logarithmic trend... ' x1Value = Log(x1Value) Dim ts As TimeSpan ts = das.dt.Date.Subtract(dtMinDt) ' Note: we are not using the GetX1FromDate function ' but we could be - it's a simple linear transformation ' but the below code makes it explicit what x1 represents xLinear = ts.TotalDays() + 1 x1Value = xLinear * xLinear ' The number of weekend days and weekday days is not normally distributed and ' is categorical (nominal) in nature - using point biserial with categories ' "weekend" and "weekday" with probabilities ' 2/7 and 5/7 for (p) and (1-p) would work. Instead, let's keep it simple and use ' Pearson's r. Weekends will have a value of 2.5 and weekdays a value of -1. x2Value = GetX2FromDate(das.dt) das.dXLinear = xLinear das.dX = x1Value das.dX2 = x2Value yValue = das.dY .X1Stats.dSum += x1Value .X2Stats.dSum += x2Value .YStats.dSum += yValue .dSumX1Y += x1Value * yValue .dSumX2Y += x2Value * yValue xSQ = x1Value * x1Value x2SQ = x2Value * x2Value ySQ = yValue * yValue .dSumX1Sq += xSQ .dSumX2Sq += x2SQ .dSumYSq += ySQ .dSumX1X2 += x1Value * x2Value Next .X1Stats.dMean = summaryStats.X1Stats.dSum / summaryStats.nDataPoints .X2Stats.dMean = summaryStats.X2Stats.dSum / summaryStats.nDataPoints .YStats.dMean = summaryStats.YStats.dSum / summaryStats.nDataPoints .X1Stats.dMeanSq = .X1Stats.dMean * .X1Stats.dMean .X2Stats.dMeanSq = .X2Stats.dMean * .X2Stats.dMean .YStats.dMeanSq = .YStats.dMean * .YStats.dMean .X1Stats.dVariance = (.dSumX1Sq / .nDataPoints) - .X1Stats.dMeanSq .X2Stats.dVariance = (.dSumX2Sq / .nDataPoints) - .X2Stats.dMeanSq .YStats.dVariance = (.dSumYSq / .nDataPoints) - .YStats.dMeanSq If .X1Stats.dVariance <= 0 Then ' No variability, std dev is 0 .X1Stats.dStdDev = 0 Else .X1Stats.dStdDev = Math.Sqrt(.X1Stats.dVariance) End If If .X2Stats.dVariance <= 0 Then ' No variability, std dev is 0 .X2Stats.dStdDev = 0 Else .X2Stats.dStdDev = Math.Sqrt(.X2Stats.dVariance) End If If .YStats.dVariance <= 0 Then ' No variability, std dev is 0 .YStats.dStdDev = 0 Else .YStats.dStdDev = Math.Sqrt(.YStats.dVariance) End If End With ' Calculate the correlation between variables X1, X2, Y CalculateAllCorrelationCoeffs(summaryStats) ' Now that we have our r's, we can calculate Betas and Intercepts With summaryStats .B1 = .RYX1 * (.YStats.dStdDev / .X1Stats.dStdDev) .A1 = .YStats.dMean - (.B1 * .X1Stats.dMean) .B2 = .RYX2 * (.YStats.dStdDev / .X2Stats.dStdDev) .A2 = .YStats.dMean - (.B2 * .X2Stats.dMean) .BX1_12 = .BetaX1 * (.YStats.dStdDev / .X1Stats.dStdDev) .BX2_12 = .BetaX2 * (.YStats.dStdDev / .X2Stats.dStdDev) .A12 = .YStats.dMean - (.BX1_12 * .X1Stats.dMean) - (.BX2_12 * .X2Stats.dMean) End With End Sub End Class ''' ''' Summary statistics for X1 (time), X2 (weekend/weekday) and ''' Y (# of sales, our Dependent Variable (DV)) ''' ''' Also keeps track of sum of X1*Y, Sum X1^2, etc, ''' for calculating correlation coefficients Public NotInheritable Class CSummaryStats Private dN As Double ''' ''' The number of data points available, stored as a double ''' for easy calculation ''' ''' ''' The number of data points available, as a double ''' Public Property nDataPoints() As Double Get Return dN End Get Set(ByVal value As Double) dN = value End Set End Property Public X1Stats As New CBasicSummaryStats Public X2Stats As New CBasicSummaryStats Public YStats As New CBasicSummaryStats ' Sum of X1 * Y and Sum of X1 squared (X1^2) Public dSumX1Y As Double Public dSumX1Sq As Double ' Sum of X2 * Y and Sum of X2 squared (X2^2) Public dSumX2Y As Double Public dSumX2Sq As Double ' Sum of X1 * X2 and Sum of Y squared (Y^2) Public dSumX1X2 As Double Public dSumYSq As Double ' Correlation between Y and X1, Y and X2, X1 and X2 Public RYX1 As Double Public RYX2 As Double Public RX1X2 As Double ' Partial correlations Public PartialRX1_Y_X2 As Double Public PartialRX2_Y_X1 As Double ' Semipartial correlations Public SemiPartialRY_X1X2 As Double Public SemiPartialRY_X2X1 As Double ' Regression-related, Beta weight of X1, X2 Public BetaX1 As Double Public BetaX2 As Double Public B1 As Double Public A1 As Double Public B2 As Double Public A2 As Double Public BX1_12 As Double Public BX2_12 As Double Public A12 As Double ''' ''' Best prediction for Y only knowing X1 ''' ''' X1 ''' Best prediction for Y, using least-squares principle ''' Public Function PredictYFromX1(ByVal Xval As Double) As Double Return (B1 * Xval) + A1 End Function ''' ''' Best prediction for Y only knowing X2 ''' ''' X2 ''' Best prediction for Y, using least-squares principle ''' Public Function PredictYFromX2(ByVal X2val As Double) As Double Return (B2 * X2val) + A2 End Function ''' ''' Best prediction for Y knowing both X1 and X2 ''' ''' X1 ''' X2 ''' Simple multiple regression best prediction for Y, using least-squares principle ''' Public Function PredictYFromX1X2(ByVal X1val As Double, ByVal X2val As Double) As Double Return (BX1_12 * X1val) + (BX2_12 * X2val) + A12 End Function End Class Public Module SummaryStatsModule ''' ''' Calculates the expected values if we only knew X1, X2, ''' or knew both X1 and X2. Also calculates the residuals. ''' ''' Date and sales entry data structure ''' Public Sub InitResiduals(ByVal sd As CDataSortedByDate) Dim das As CDataEntry ' date and sales entry With sd.summaryStats For Each das In sd.rgDataSortedByDate das.dExpectedX1 = .PredictYFromX1(das.dX) das.dResidualX1 = das.dY - das.dExpectedX1 das.dExpectedX2 = .PredictYFromX2(das.dX2) das.dResidualX2 = das.dY - das.dExpectedX2 das.dExpectedX1X2 = .PredictYFromX1X2(das.dX, das.dX2) das.dResidualX1X2 = das.dY - das.dExpectedX1X2 Next End With End Sub ''' ''' Turn the passage of time into number of days elapsed, ''' transformed to change scale and ensure non-zero values ''' ''' The date ''' The oldest date in the dataset ''' An "X" value based on dt ''' Public Function GetXLinearFromDate(ByVal dt As DateTime, ByVal dtOldest As Date) As Double Dim ts As TimeSpan ts = dt.Date.Subtract(dtOldest) Return CDbl(CInt(ts.TotalDays) + 1) * 0.1 End Function ''' ''' Turn the passage of time into number of days elapsed, squared, ''' transformed to change scale and ensure non-zero values ''' ''' The date ''' The oldest date in the dataset ''' An "X" value (quadratic passage of time) based on dt ''' Public Function GetX1FromDate(ByVal dt As DateTime, ByVal dtOldest As Date) As Double Dim ts As TimeSpan ts = dt.Date.Subtract(dtOldest) Dim d As Double = CDbl(CInt(ts.TotalDays) + 1) Return d * d * 0.01 End Function ''' ''' Assigns a number to weekday vs. weekend, given the date ''' ''' The date ''' -1 for weekday, +2.5 for weekend ''' Public Function GetX2FromDate(ByVal dt As DateTime) As Double Const dWeekendVal As Double = 2.5 Const dWeekdayVal As Double = -1.0 Dim d As Double Select Case dt.DayOfWeek Case Is = DayOfWeek.Saturday d = dWeekendVal Case Is = DayOfWeek.Sunday d = dWeekendVal Case Else d = dWeekdayVal End Select Return d End Function End Module