Using statistical models to predict future sales

Recommended Statistics Software


Basic Correlation, Multiple Regression (2-IVs) Source Code
 Home Online Tools Resources Log In 
Weather, Sales Trend demand forecasting
The following source code was written by Owen Emlen for BrainTech, LLC.  It can be compiled using Visual Basic.NET.  The formulas and logic for calculating Pearson's r, partial and semi-partial correlations, beta weights, an intercept, and big R-squared for a 2-IV multiple regression equation are well commented and documented below.  This source code is also available in its original (text) format.  NEW: Additional (Free) source code written by Owen Emlen for learning purposes / idea exchange / free use (Released 01/13/2007): Introduction To Programming Using Visual Basic.net
Introduction To Programming Using Visual Basic.net

Imports System.Math
Imports System.Collections.Generic

''' Basic Stats Class to keep track of summary statistics
Public NotInheritable Class CBasicSummaryStats
   Public dMean As Double
   Public dStdDev As Double
   Public dSum As Double
   Public dMeanSq As Double
   Public dVariance As Double
End Class

Public NotInheritable Class cDataSortedByDate
   Public rgDataSortedByDate As New List(Of CDataEntry)(128)
   Public rgDataLookupByDate As New Dictionary(Of DateTime, CDataEntry)(128) 
   Public dtFirstDataEntry As DateTime
   Public dtLastDataEntry As DateTime
   Public summaryStats As CSummaryStats

   Public Sub CalculateAllCorrelationCoeffs(ByRef meansd As CSummaryStats)

      ' Pearson's r =            N * sum(XY) - sum(X)sum(Y)
      '                       -------------------------------------------------------------
      '               sqrt((N*sum(X^2)) - sum(X)^2) * sqrt((N*sum(Y^2)) - sum(Y)^2)

      Dim Numerator As Double
      Dim Denominator As Double
      Dim Denom1 As Double
      Dim Denom2 As Double
      
      With meansd
         Numerator = (.dN * .dSumX1Y) - (.X1Stats.dSum * .YStats.dSum)
         Denom1 = (.dN * .dSumX1Sq) - (.X1Stats.dSum * .X1Stats.dSum)
         Denom2 = (.dN * .dSumYSq) - (.YStats.dSum * .YStats.dSum)
         Denominator = Sqrt(Denom1) * Sqrt(Denom2)
         .RYX1 = Numerator / Denominator

         Numerator = (.dN * .dSumX2Y) - (.X2Stats.dSum * .YStats.dSum)
         Denom1 = (.dN * .dSumX2Sq) - (.X2Stats.dSum * .X2Stats.dSum)
         Denom2 = (.dN * .dSumYSq) - (.YStats.dSum * .YStats.dSum)
         Denominator = Sqrt(Denom1) * Sqrt(Denom2)
         .RYX2 = Numerator / Denominator

         Numerator = (.dN * .dSumX1X2) - (.X1Stats.dSum * .X2Stats.dSum)
         Denom1 = (.dN * .dSumX1Sq) - (.X1Stats.dSum * .X1Stats.dSum)
         Denom2 = (.dN * .dSumX2Sq) - (.X2Stats.dSum * .X2Stats.dSum)
         Denominator = Sqrt(Denom1) * Sqrt(Denom2)
         .RX1X2 = Numerator / Denominator

         ' Now calculate partial correlations
         ' rX1Y·X2 =           rX1Y - (rX1X2)(rYX2)
         '                           ------------------------------
         '                     sqrt[1 - rX1X2^2] * sqrt[1 - rYX2^2]

         ' rX1,Y . X2

         Numerator = .RYX1 - (.RX1X2 * .RYX2)

         Denom1 = (1 - (.RX1X2 * .RX1X2))
         Denom2 = (1 - (.RYX2 * .RYX2))
         Denominator = Sqrt(Denom1) * Sqrt(Denom2)
         .PartialRX1_Y_X2 = Numerator / Denominator

         ' rX2,Y . X1

         Numerator = .RYX2 - (.RX1X2 * .RYX1)
         Denom1 = (1 - (.RX1X2 * .RX1X2))
         Denom2 = (1 - (.RYX1 * .RYX1))
         Denominator = Sqrt(Denom1) * Sqrt(Denom2)

         .PartialRX2_Y_X1 = Numerator / Denominator

         ' Semipartial correlations are next.  These are
         ' used to calculate the beta weight of each 
         ' independent variables in the multiple regression equation.

         ' rY·X2(X1) =   rYX2 - (rYX1)(rX1X2)
         '                     ------------------------------
         '                         sqrt[1 - rX1X2^2] 

         ' rY·X1(X2)
         Numerator = .RYX1 - (.RYX2 * .RX1X2)
         Denom1 = (1 - (.RX1X2 * .RX1X2))
         Denominator = Sqrt(Denom1)
         .SemiPartialRY_X1X2 = Numerator / Denominator

         ' rY·X2(X1)
         Numerator = .RYX2 - (.RYX1 * .RX1X2)
         Denom1 = (1 - (.RX1X2 * .RX1X2))
         Denominator = Sqrt(Denom1)

         .SemiPartialRY_X2X1 = Numerator / Denominator

         .BetaX1 = .SemiPartialRY_X1X2 * Sqrt(1 - (.RX1X2 * .RX1X2))
         .BetaX2 = .SemiPartialRY_X2X1 * Sqrt(1 - (.RX1X2 * .RX1X2))

      End With
   End Sub

   Public Sub FindDate(ByVal dt As DateTime) As CDataEntry
      If rgDataLookupByDate.ContainsKey(dt) = True Then
         Return rgDataLookupByDate(dt)
      Else
         Return Nothing
      End If
   End Function

   Public Sub CalculateSummaryStatistics()
      summaryStats = New CSummaryStats
      Dim nCnt As Integer = 0
      Dim das As CDataEntry

      nCnt = rgDataSortedByDate.Count
      summaryStats.dN = CDbl(nCnt)

      Dim yValue As Double
      Dim x1Value As Double
      Dim x2Value As Double
      Dim xLinear As Double
      Dim xSQ As Double
      Dim x2SQ As Double
      Dim ySQ As Double
      Dim dtMinDt As DateTime = DateTime.MaxValue
      Dim fFirst As Boolean = True

      With summaryStats
         For Each das In rgDataSortedByDate
            If fFirst = True Then
               dtMinDt = das.dt
               fFirst = False
            End If

            ' Linear trend
            ' x1Value = CDbl(CInt(ts.TotalDays))
            ' Let's look for a quadratic trend...
            ' Could also look for a logarithmic trend...
            ' x1Value = Log(x1Value)
            Dim ts As TimeSpan
            ts = das.dt.Date.Subtract(dtMinDt)

            ' Note: we are not using the GetX1FromDate function
            ' but we could be - it's a simple linear transformation
            ' but the below code makes it explicit what x1 represents
            xLinear = ts.TotalDays() + 1
            x1Value = xLinear * xLinear

            ' The number of weekend days and weekday days is not normally distributed and
            ' is categorical (nominal) in nature - using point biserial with categories
            ' "weekend" and "weekday" with probabilities
            ' 2/7 and 5/7 for (p) and (1-p) would work. Instead, let's keep it simple and use
            ' Pearson's r. Weekends will have a value of 2.5 and weekdays a value of -1.
            x2Value = GetX2FromDate(das.dt)

            das.dXLinear = xLinear
            das.dX = x1Value
            das.dX2 = x2Value
            yValue = das.dY

            .X1Stats.dSum += x1Value
            .X2Stats.dSum += x2Value
            .YStats.dSum += yValue

            .dSumX1Y += x1Value * yValue
            .dSumX2Y += x2Value * yValue

            xSQ = x1Value * x1Value
            x2SQ = x2Value * x2Value
            ySQ = yValue * yValue

            .dSumX1Sq += xSQ
            .dSumX2Sq += x2SQ
            .dSumYSq += ySQ

            .dSumX1X2 += x1Value * x2Value
         Next

         .X1Stats.dMean = summaryStats.X1Stats.dSum / summaryStats.dN
         .X2Stats.dMean = summaryStats.X2Stats.dSum / summaryStats.dN
         .YStats.dMean = summaryStats.YStats.dSum / summaryStats.dN
         .X1Stats.dMeanSq = .X1Stats.dMean * .X1Stats.dMean
         .X2Stats.dMeanSq = .X2Stats.dMean * .X2Stats.dMean
         .YStats.dMeanSq = .YStats.dMean * .YStats.dMean
         .X1Stats.dVariance = (.dSumX1Sq / .dN) - .X1Stats.dMeanSq
         .X2Stats.dVariance = (.dSumX2Sq / .dN) - .X2Stats.dMeanSq
         .YStats.dVariance = (.dSumYSq / .dN) - .YStats.dMeanSq

         If .X1Stats.dVariance <= 0 Then
            ' No variability, std dev is 0
            .X1Stats.dStdDev = 0
         Else
            .X1Stats.dStdDev = Math.Sqrt(.X1Stats.dVariance)
         End If

         If .X2Stats.dVariance <= 0 Then
            ' No variability, std dev is 0
            .X2Stats.dStdDev = 0
         Else
            .X2Stats.dStdDev = Math.Sqrt(.X2Stats.dVariance)
         End If

         If .YStats.dVariance <= 0 Then
            ' No variability, std dev is 0
            .YStats.dStdDev = 0
         Else
            .YStats.dStdDev = Math.Sqrt(.YStats.dVariance)
         End If
      End With


      ' Calculate the correlation between variables X1, X2, Y
      CalculateAllCorrelationCoeffs(summaryStats)

      ' Now that we have our r's, we can calculate Betas and Intercepts
      ' There you have it... the basics of multiple regression in Visual Basic
      ' (Thanks, Kristi for making this so simple to code up... no textbooks needed)

      With summaryStats
         .B1 = .RYX1 * (.YStats.dStdDev / .X1Stats.dStdDev)
         .A1 = .YStats.dMean - (.B1 * .X1Stats.dMean)
         .B2 = .RYX2 * (.YStats.dStdDev / .X2Stats.dStdDev)
         .A2 = .YStats.dMean - (.B2 * .X2Stats.dMean)

         .BX1_12 = .BetaX1 * (.YStats.dStdDev / .X1Stats.dStdDev)
         .BX2_12 = .BetaX2 * (.YStats.dStdDev / .X2Stats.dStdDev)
         .A12 = .YStats.dMean - (.BX1_12 * .X1Stats.dMean) - (.BX2_12 * .X2Stats.dMean)
      End With

   End Sub

End Class


''' Basic summary stats for X1 (time), X2 (weekend/weekday) and
''' Y (# of sales, our Dependent Variable (DV))
''' Also keeps track of sum of X1*Y, Sum X1^2, etc
''' for calculating correlation coefficients
Public NotInheritable Class CSummaryStats
   Public dN As Double

   Public X1Stats As New CBasicSummaryStats
   Public X2Stats As New CBasicSummaryStats
   Public YStats As New CBasicSummaryStats

   Public dSumX1Y As Double
   Public dSumX1Sq As Double
   Public dSumX2Y As Double
   Public dSumX2Sq As Double
   Public dSumYSq As Double
   Public dSumX1X2 As Double

   Public RYX1 As Double
   Public RYX2 As Double
   Public RX1X2 As Double

   Public PartialRX1_Y_X2 As Double
   Public PartialRX2_Y_X1 As Double

   ' The linear model will fit best if the date/sales trend is linear
   Public SemiPartialRY_X1X2 As Double
   Public SemiPartialRY_X2X1 As Double

   ' The interaction model tends to fit best if the date/sales trend is quadratic
   Public BetaX1 As Double
   Public BetaX2 As Double
   Public B1 As Double
   Public A1 As Double
   Public B2 As Double
   Public A2 As Double
   Public BX1_12 As Double
   Public BX2_12 As Double
   Public A12 As Double

   ''' Best estimate for Y only knowing X1
   Public Sub PredictYFromX1(ByVal Xval As Double) As Double
      Return (B1 * Xval) + A1
   End Function

   ''' Best estimate for Y only knowing X2
   Public Sub PredictYFromX2(ByVal X2val As Double) As Double
      Return (B2 * X2val) + A2
   End Function

   ''' Best estimate for Y knowing X1 and X2
   Public Sub PredictYFromX1X2(ByVal X1val As Double, ByVal X2val As Double) As Double
      Return (BX1_12 * X1val) + (BX2_12 * X2val) + A12
   End Function

End Class


Public Module SummaryStatsModule

   ''' Calculates the expected values if we only knew X1, X2,
   ''' or knew both X1 and X2. Also calculates the residuals.
   ''' param name="sd" : Date and sales entry data structure
   Public Sub InitResiduals(ByVal sd As cDataSortedByDate)
      Dim das As CDataEntry   ' date and sales entry

      With sd.summaryStats
         For Each das In sd.rgDataSortedByDate
            das.dExpectedX1 = .PredictYFromX1(das.dX)
            das.dResidualX1 = das.dY - das.dExpectedX1
            das.dExpectedX2 = .PredictYFromX2(das.dX2)
            das.dResidualX2 = das.dY - das.dExpectedX2
            das.dExpectedX1X2 = .PredictYFromX1X2(das.dX, das.dX2)
            das.dResidualX1X2 = das.dY - das.dExpectedX1X2
         Next
      End With
   End Sub

   ''' Turn the passage of time into number of days elapsed,
   ''' transformed to change scale and ensure non-zero values
   ''' param name="dt" : The date
   ''' param name="dtOldest" : The oldest date in the dataset
   ''' returns an "X" value based on dt
   Public Sub GetXLinearFromDate(ByVal dt As DateTime, ByVal dtOldest As Date) As Double
      Dim ts As TimeSpan
      ts = dt.Date.Subtract(dtOldest)
      Return CDbl(CInt(ts.TotalDays) + 1) * 0.1
   End Function

   ''' Turn the passage of time into number of days elapsed, squared,
   ''' transformed to change scale and ensure non-zero values
   ''' param name="dt" : The date
   ''' param name="dtOldest" : The oldest date in the dataset
   ''' returns an "X" value (quadratic passage of time) based on dt
   Public Sub GetX1FromDate(ByVal dt As DateTime, ByVal dtOldest As Date) As Double
      Dim ts As TimeSpan
      ts = dt.Date.Subtract(dtOldest)
      Dim d As Double = CDbl(CInt(ts.TotalDays) + 1)
      Return d * d * 0.01
   End Function


   ''' Assigns a number to weekday vs. weekend, given the date
   ''' param name="dt" : The date
   ''' returns -1 for weekday, +2.5 for weekend
   Public Sub GetX2FromDate(ByVal dt As DateTime) As Double
      Const dWeekendVal As Double = 2.5
      Const dWeekdayVal As Double = -1.0
      Dim d As Double

      Select Case dt.DayOfWeek
         Case Is = DayOfWeek.Saturday
            d = dWeekendVal
         Case Is = DayOfWeek.Sunday
            d = dWeekendVal
         Case Else
            d = dWeekdayVal
      End Select

      Return d
   End Function
End Module

Public NotInheritable Class CDataEntry : Implements IComparable
   Public dt As DateTime
   Public dX As Double      ' stores the date as a deviation from the start date (quadratic)
   Public dX2 As Double      ' stores if it is a weekend or not
   Public dXLinear As Double    ' stores the date as a deviation from the start date
   ' stores the squared date as a deviation from the start date
   ' there may be nonlinear, polynomial, or logarithmic trends
   ' as the season progresses

   Public dY As Double   ' This is considered "Y"
   Public dResidualX1 As Double
   Public dResidualX2 As Double
   Public dResidualX1X2 As Double
   Public dExpectedX1 As Double
   Public dExpectedX2 As Double
   Public dExpectedX1X2 As Double

   ' This is risky, but done for performance reasons, and because
   ' the code is only used internally at the moment -
   ' please don't be comparing CDataEntries to anything
   ' other than other CDataEntries!
   ' This is used for sorting by date only 

   Public Sub CompareTo(ByVal obj As Object) As Integer _
      Implements System.IComparable.CompareTo
      Dim dsCompareTo As CDataEntry = DirectCast(obj, CDataEntry)
      If Me.dt > dsCompareTo.dt Then Return 1
      If Me.dt < dsCompareTo.dt Then Return -1
      Return 0
   End Function

End Class

Home Page 
Try it Now
Account
Services
Weather
Resources
Contact Us
Free business advertising Home | Privacy Policy | Contact Us  
Copyright © 2006 BrainTech, LLC. All Rights Reserved