#include "math.h" #include "malloc.h" #include "stdio.h" #include "stdlib.h" //Tree int **NodeList, ****Nijk, **ClassDistribute;//NodeList, Nijk, ClassDistribute are in the same size int NLStart, NLSize, NLSizeIncremental;//Dynamic memory allocation int NodeCount;//The number of nodes in the tree int *MultiChoices,*MultiChoiceAtts; //Parameters int Dim, NumClass, NumAttValue[54], MaxNumAttValue, WinSize, Nmin; double Delta, Tao; int **WindowData,*UsedAtts; void ExpandNodeList() { int oldNLSize,i,j,k; oldNLSize=NLSize; NLSize+=NLSizeIncremental; NodeList=(int **)realloc(NodeList, NLSize*sizeof(int *)); Nijk=(int ****)realloc(Nijk,NLSize*sizeof(int ***)); ClassDistribute=(int **)realloc(ClassDistribute,NLSize*sizeof(int *)); for(i=oldNLSize; i*MaxID) *MaxID=NodeList[NodePos][MaxNumAttValue+2]; if (NodeList[NodePos][MaxNumAttValue+1]>=0) WindowFindMaxID(NodeList[NodePos][MaxNumAttValue+1], Data, MaxID); if (NodeList[NodePos][0]>=0) { i=Data[NodeList[NodePos][0]+1]; WindowFindMaxID(NodeList[NodePos][i+1], Data, MaxID); } } void WindowAddData(int *Data, int MaxNodeID, int *WinIndex) { int i; for(i=0; i<=Dim; i++) WindowData[*WinIndex][i]=Data[i]; WindowData[*WinIndex][Dim+1]=MaxNodeID; (*WinIndex)++; if (*WinIndex>=WinSize) *WinIndex=0; } void WindowForgetData(int NodePos, int *Data, int MaxID) { int i; if (NodeList[NodePos][MaxNumAttValue+2]<=MaxID) { ClassDistribute[NodePos][Data[0]]--; for(i=1; i<=Dim; i++) Nijk[NodePos][Data[0]][i-1][Data[i]]--; } if (NodeList[NodePos][MaxNumAttValue+1]>=0) WindowForgetData(NodeList[NodePos][MaxNumAttValue+1],Data,MaxID); if (NodeList[NodePos][0]>=0) { i=Data[NodeList[NodePos][0]+1]; WindowForgetData(NodeList[NodePos][i+1],Data,MaxID); } } double lg2(double x) { return log(x)/log(2.0); } bool UsedAtt(int Att, int *UsedAttSet) { int i; for(i=1; i<=UsedAttSet[0]; i++) if (Att==UsedAttSet[i]) return 1; return 0; } int CheckMultiClass(int NodePos) { int i,MultiClass=0; for(i=0; i0) MultiClass++; return MultiClass; } double GetEntropyTotal(int NodePos) { double out = 0.0; double prob; int i, Total=0; for(i=0; i 0.0) out -= (prob * lg2(prob)); } return out; } int MajorityClass(int NodePos) { int i, MaxIndex,MaxValue; MaxIndex=0; MaxValue=ClassDistribute[NodePos][0]; for(i=1; iMaxValue) { MaxValue=ClassDistribute[NodePos][i]; MaxIndex=i; } return MaxIndex; } void InitializeNode(int NodePos, int StartTime) { int i,j,k; NodeList[NodePos][0]=-2; NodeList[NodePos][MaxNumAttValue+1]=-1; NodeCount++; NodeList[NodePos][MaxNumAttValue+2]=NodeCount; NodeList[NodePos][MaxNumAttValue+3]=StartTime; NodeList[NodePos][MaxNumAttValue+4]=0; for(i=0; i1)) { //Find the Best two attributes TotalNum=GetNodeData(NodePos); for(i=0; i0) { Prob=(double)Nijk[NodePos][k][i][j]/(double)TotalNumPerVal; Entropy-=(Prob*lg2(Prob)); } FinalEntropy+=((double)TotalNumPerVal/(double)TotalNum)*Entropy; } if (FinalEntropyhoeffdingBound) || ((First_Second<=hoeffdingBound)&&(hoeffdingBound=NLSize) ExpandNodeList(); NLStart=j+1; NodeList[NodePos][i]=j; InitializeNode(j,CurrentDataIndex); } } } } void AddDataToSubNode(int NodePos, int *Data, int *UsedAtts, int CurrentDataIndex) { int i; ClassDistribute[NodePos][Data[0]]++; for(i=0; i0) { i=Data[NodeList[NodePos][0]+1]; NodePos=NodeList[NodePos][i+1]; UsedAtts[0]++; UsedAtts[UsedAtts[0]]=NodeList[NodePos][0]; AddDataToSubNode(NodePos,Data,UsedAtts,CurrentDataIndex); UsedAtts[0]--; } } void DeleteNode(int NodePos, int LeftPos) { int i; if (NodeList[NodePos][0]>=0) for(i=1; i<=NumAttValue[NodeList[NodePos][0]]; i++) DeleteNode(NodeList[NodePos][i],-1); if (NodeList[NodePos][MaxNumAttValue+1]!=LeftPos) DeleteNode(NodeList[NodePos][MaxNumAttValue+1],LeftPos); else if ((LeftPos>=0) && (NodeList[LeftPos][MaxNumAttValue+1]>=0)) { DeleteNode(NodeList[LeftPos][MaxNumAttValue+1],-1); NodeList[LeftPos][MaxNumAttValue+1]=-1; } NodeList[NodePos][0]=-1; if (NodePos=0) { AttValue=Data[NodeList[RootNodePos][0]+1]; CurrentNodePos=NodeList[RootNodePos][AttValue+1]; } while (CurrentNodePos>=0) { MultiChoices[0]=1; MultiChoices[1]=CurrentNodePos; CurrentNodePos=NodeList[CurrentNodePos][MaxNumAttValue+1]; while (CurrentNodePos>=0) { MultiChoices[0]++; MultiChoices[MultiChoices[0]]=CurrentNodePos; CurrentNodePos=NodeList[CurrentNodePos][MaxNumAttValue+1]; } BestChoice=MultiChoices[1]; BestAccuracy=(double)NodeList[BestChoice][MaxNumAttValue+4]/ (double)GetNodeData(BestChoice); for(i=2; i<=MultiChoices[0]; i++) { tmpValue=(double)NodeList[MultiChoices[i]][MaxNumAttValue+4]/ (double)GetNodeData(MultiChoices[i]); if (tmpValue>BestAccuracy) { BestChoice=MultiChoices[i]; BestAccuracy=tmpValue; } } if ((NodeList[BestChoice][0]==-2)||(NodeList[BestChoice][0]==-3)) { PredictClass=MajorityClass(BestChoice); CurrentNodePos=-1; } else if (NodeList[BestChoice][0]>=0) { AttValue=Data[NodeList[BestChoice][0]+1]; CurrentNodePos=NodeList[BestChoice][AttValue+1]; } } if (Pos!=NULL) { if (PredictClass==0) (*Pos)++; else if (PredictClass==1) (*Neg)++; } if (PredictClass==Data[0]) return 1; else return 0; } void AddDataToNode(int NodePos, int *Data, int *UsedAtts, int TotalDataSeen) { int i,CurrentNodePos; //Add Data to current node ClassDistribute[NodePos][Data[0]]++; for(i=0; i=0) { AddDataToSubNode(NodeList[CurrentNodePos][MaxNumAttValue+1], Data, UsedAtts, TotalDataSeen); CurrentNodePos=NodeList[CurrentNodePos][MaxNumAttValue+1]; } //Predict the class label NodeList[NodePos][MaxNumAttValue+4]+=PredictOneTestData(NodePos, Data, NULL, NULL); CurrentNodePos=NodeList[NodePos][MaxNumAttValue+1]; while (CurrentNodePos>=0) { NodeList[CurrentNodePos][MaxNumAttValue+4]+= PredictOneTestData(CurrentNodePos, Data, NULL, NULL); CurrentNodePos=NodeList[CurrentNodePos][MaxNumAttValue+1]; } } double PredictTestDataSet(int **TestData, int NumTest, int *Pos, int *Neg) { int i,Sum=0; for(i=0; i1)) { //Find the Best two attributes *FirstDelta=1000; *FirstAtt=-1; TotalNum=GetNodeData(NodePos); for(i=0; i0) { Prob=(double)Nijk[NodePos][k][i][j]/(double)TotalNumPerVal; Entropy-=(Prob*lg2(Prob)); } FinalEntropy+=((double)TotalNumPerVal/(double)TotalNum)*Entropy; } if (FinalEntropy<*FirstDelta) { SecondDelta=*FirstDelta; SecondAtt=*FirstAtt; *FirstDelta=FinalEntropy; *FirstAtt=i; } else if (FinalEntropy=0) { MultiChoices[0]=1; MultiChoices[1]=NodePos; MultiChoiceAtts[0]=1; MultiChoiceAtts[1]=NodeList[NodePos][0]; CurrentNodePos=NodeList[NodePos][MaxNumAttValue+1]; while (CurrentNodePos>=0) { MultiChoices[0]++; MultiChoices[MultiChoices[0]]=CurrentNodePos; if (NodeList[CurrentNodePos][0]>=0) { MultiChoiceAtts[0]++; MultiChoiceAtts[MultiChoiceAtts[0]]=NodeList[CurrentNodePos][0]; } CurrentNodePos=NodeList[CurrentNodePos][MaxNumAttValue+1]; } BestChoice=MultiChoices[1]; BestAccuracy=(double)NodeList[BestChoice][MaxNumAttValue+4]/ (double)GetNodeData(BestChoice); for(i=2; i<=MultiChoices[0]; i++) { tmpValue=(double)NodeList[MultiChoices[i]][MaxNumAttValue+4]/ (double)GetNodeData(MultiChoices[i]); if (tmpValue>BestAccuracy) { BestChoice=MultiChoices[i]; BestAccuracy=tmpValue; } } } if (BestChoice>=0) for(l=1; l<=MultiChoices[0]; l++) { BestChoice=MultiChoices[l]; UsedAtts[0]++; UsedAtts[UsedAtts[0]]=NodeList[BestChoice][0]; First_Second=CheckSplitEntropyValidity(BestChoice, UsedAtts, &FirstAtt, &FirstDelta); UsedAtts[0]--; if ((First_Second>=0)&&(!UsedAtt(FirstAtt, MultiChoiceAtts))) FindAltTree=0; else FindAltTree=1; if (!FindAltTree) { Range=lg2(NumClass); NumDataSeen=GetNodeData(BestChoice); hoeffdingBound = sqrt((Range*Range*log(1/Delta))/(2*NumDataSeen)); EntropyTotal=GetEntropyTotal(BestChoice); if ((FirstDeltahoeffdingBound) || ((First_Second<=hoeffdingBound) && (hoeffdingBound=0.5*Tao)))) { CurrentNodePos=MultiChoices[MultiChoices[0]]; j=NLStart; while ((j=NLSize) ExpandNodeList(); NLStart=j+1; NodeList[CurrentNodePos][MaxNumAttValue+1]=j; InitializeNode(j,TotalDataSeen); NodeList[j][0]=FirstAtt; for(i=1; i<=NumAttValue[FirstAtt]; i++) { //Find a vacant position in NodeList k=NLStart; while ((k=NLSize) ExpandNodeList(); NLStart=k+1; NodeList[j][i]=k; InitializeNode(k,TotalDataSeen); } } } for(i=1; i<=NumAttValue[NodeList[BestChoice][0]]; i++) CheckSplitValidity(NodeList[BestChoice][i],UsedAtts,TotalDataSeen); } } void ClearTreeAccuracy(int NodePos) { int i; for(i=0; iCheckConceptDriftInterval)) CurrentAccuracy+=PredictOneTestData(0, TrainData[i-1], NULL, NULL); //Add an exmaple to tree CurrentNode=0; FoundNode=1; UsedAtts[0]=0; while ((FoundNode==1) && (NodeList[CurrentNode][0]!=-1)) { AddDataToNode(CurrentNode, TrainData[i-1], UsedAtts, i); if (NodeList[CurrentNode][0]<0)//no child node FoundNode=0; else { CurrentAtt=NodeList[CurrentNode][0]; CurrentNode=NodeList[CurrentNode][TrainData[i-1][CurrentAtt+1]+1]; UsedAtts[0]++; UsedAtts[UsedAtts[0]]=CurrentAtt; } } //Update the window if (WindowData[WinIndex][0]!=-1) WindowForgetData(0, WindowData[WinIndex], WindowData[WinIndex][Dim+1]); MaxNodeID=0; WindowFindMaxID(0, TrainData[i-1], &MaxNodeID); WindowAddData(TrainData[i-1], MaxNodeID, &WinIndex); //CheckSplitValidity: Check whether to grow alternate subtree if (!CheckConceptDrift) { if ((i>CheckConceptDriftInterval) && (i%CheckConceptDriftInterval == 0)) { if (OldAccuracy-CurrentAccuracy>=0.03*CheckConceptDriftInterval) { CheckConceptDrift=1; StartCheckConceptDriftPoint=i; OldAccuracy=CurrentAccuracy=0; ClearTreeAccuracy(0); } else { OldAccuracy=CurrentAccuracy; CurrentAccuracy=0; } } } else { tmpInt=i-StartCheckConceptDriftPoint; if (tmpInt<=20000) { if (tmpInt % CheckConceptDriftInterval == 0) { UsedAtts[0]=0; CheckSplitValidity(0, UsedAtts, i); } } else CheckConceptDrift=OldAccuracy=CurrentAccuracy=0; } if (i % 2000==0) { UsedAtts[0]=0; CheckSplitValidity(0, UsedAtts, i); } printf("TrainingExample-%d\n", i); //Use the test set to test the accuracy of the current tree, //and the test set is the future "NumText" examples in the current data stream if (i % TestInterval == 0) { NumPos[(i/TestInterval)-1]=0; NumNeg[(i/TestInterval)-1]=0; Accuracy[(i/TestInterval)-1]=PredictTestDataSet(&TrainData[i], NumTest, &NumPos[(i/TestInterval)-1], &NumNeg[(i/TestInterval)-1]); printf("Accuracy:%f\n",Accuracy[(i/TestInterval)-1]); } } fclose(fp); //Output the result double aveAccuracy=0; fp=fopen("Results.txt","w"); fprintf(fp,"Number of Node= %d \n", NodeCount); for(i=0; i<(NumTrain/TestInterval); i++) { fprintf(fp,"%d : %f %d %d\n", (i+1)*TestInterval, Accuracy[i], NumPos[i], NumNeg[i]); aveAccuracy+=Accuracy[i]; } aveAccuracy/=(double)(NumTrain/TestInterval); fprintf(fp,"Average Accuracy = %f\n\n",aveAccuracy); fclose(fp); //Free memories for(i=0; i

        
        

                        ɧbÉ«ÓûÍø