标签:
杂谈 |
分类: 稚嫩痕迹 |
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define TRUE 1
#define FALSE 0
#define ExecTimes 100 //执行次数
int N,K;
int * CenterDataIndex; //初始化中心点在数据集中的序列值数组
double ** CenterData; //每个聚类的中心点数据 二维
double ** CenterDataCopy; //聚类中心点备份
double ** AllData; //输入的数据集,改为二维
double *** Cluster; //每个聚类的数据,三维数组
int * ElementNum; //每个聚类的数据个数数组
char Class[150][20]; //读到的类别名称
int
double FmeasureMax = 0, FmeasureMin = 1.1, FmeasureMid = 0;
long int RunTime; //总执行时间
int Exec = 0; //执行计数
double * BestClusterArray; //最佳F的聚类四维转一维 N*4
double * BestCenterArray; //最佳F的聚类中心四维转一维 K*4
int * BestClusterElement; //最佳F的各聚类成员个数 K
void WriteFresult(void);
int * CreateRandomArray(int n,int k,int * centerindex)
{
int i=0,j=0;
srand((unsigned)time(NULL) + Exec);
for(i=0;i<k;i++)
{
int randtheonly=TRUE;
while(randtheonly)
{
int a=rand()%n;
for(j=0;j<i;j++)
{
if(centerindex[j]==a) //重复
{
randtheonly=TRUE;
break;
}
}
if(j==i) //不重复
{
centerindex[i]=a;
randtheonly=FALSE;
}
}
}
return centerindex;
}
void CenterDataBackup() //聚类均值数组的备份
{
int i=0;
for(i=0;i<K;i++)
{
CenterDataCopy[i][0]=CenterData[i][0];
CenterDataCopy[i][1]=CenterData[i][1];
CenterDataCopy[i][2]=CenterData[i][2];
CenterDataCopy[i][3]=CenterData[i][3];
}
}
void InitCenter()
{
int i=0;
CenterDataIndex = CreateRandomArray(N,K,CenterDataIndex); //随机产生K个均值序列
for(i=0;i<K;i++)
{
CenterData[i][0]=AllData[CenterDataIndex[i]][0]; //将对应坐标赋值给中心点坐标
CenterData[i][1]=AllData[CenterDataIndex[i]][1];
CenterData[i][2]=AllData[CenterDataIndex[i]][2];
CenterData[i][3]=AllData[CenterDataIndex[i]][3];
}
CenterDataBackup(); //均值备份
}
int GetIndex(double value[4],double ** ave)
{
int i=0;
int index=i; //距离最小的聚类索引
double min=sqrt( pow((value[0] - ave[i][0]), 2.0) +
pow((value[1] - ave[i][1]), 2.0) +
for(i=0;i<K;i++) //找到距离最小的聚类索引
if(sqrt( pow((value[0] - ave[i][0]), 2.0) + pow((value[1] -
ave[i][1]), 2.0) +
pow((value[2] - ave[i][2]), 2.0) + pow((value[3] - ave[i][3]), 2.0) )<min)
{
index=i;
min=sqrt( pow((value[0] - ave[i][0]), 2.0) + pow((value[1] -
ave[i][1]), 2.0) +
}
}
return index;
}
void AddToCluster(int index,double value[4],int i) //加入一个数据到得到的合适聚类中
{
Cluster[index][ElementNum[index]][0]=value[0];
Cluster[index][ElementNum[index]][1]=value[1];
Cluster[index][ElementNum[index]][2]=value[2];
Cluster[index][ElementNum[index]][3]=value[3];
Cluster[index][ElementNum[index]][4]=i;
ElementNum[index]++;
}
void UpdateCluster() //更新聚类成员
{
int i=0;
int temindex;
for(i=0;i<K;i++) //清空所有聚类元素个数
{
ElementNum[i]=0;
}
for(i=0;i<N;i++)
{
temindex=GetIndex(AllData[i],CenterData); //得到与当前数据相差最小的聚类索引
AddToCluster(temindex,AllData[i],i); //加入到相应的聚类
}
}
void InitData() //数据初始化
{
char * ReadFileName="./data set/iris.data";
FILE * RDF;
int i=0,j=0;
N=0;
double DataRead[4];
char Comma;
char TemClass[20];
if((RDF=fopen(ReadFileName,"r"))==NULL)
{
printf("Data set file not exist!\n");
exit(0);
}
while(!feof(RDF))
{
fscanf(RDF, "%c", &Comma);
fscanf(RDF, "%lf", &DataRead[1]);
fscanf(RDF, "%c", &Comma);
fscanf(RDF, "%lf", &DataRead[2]);
fscanf(RDF, "%c", &Comma);
fscanf(RDF, "%lf", &DataRead[3]);
fscanf(RDF, "%c", &Comma);
fgets(TemClass, 20, RDF); //读取20个字符内的字符串,超出20个则忽略20个之后字符
N=N+1;
}
N-=1;
fclose(RDF);
K = 3;
ElementNum=(int *)malloc(sizeof(int)*K);
CenterDataIndex=(int *)malloc(sizeof(int)*K);
CenterData=(double **)malloc(sizeof(double)*K);
CenterDataCopy=(double **)malloc(sizeof(double)*K);
AllData=(double **)malloc(sizeof(double)*N);
Cluster=(double ***)malloc(sizeof(double)*K);
BestClusterArray = (double *)malloc(sizeof(double)*(5*N));
BestCenterArray = (double *)malloc(sizeof(double)*(4*K));
BestClusterElement = (int *)malloc(sizeof(int)*K);
for(i=0;i<K;i++)
{
CenterData[i]=(double *)malloc(sizeof(double)*4);
CenterDataCopy[i]=(double *)malloc(sizeof(double)*4);
Cluster[i]=(double **)malloc(sizeof(double)*N);
for(j=0;j<N;j++)
{
Cluster[i][j]=(double *)malloc(sizeof(double)*5);
}
ElementNum[i]=0; //初始每个聚类的元素个数为0
}
for(i=0;i<N;i++)
{
AllData[i]=(double *)malloc(sizeof(double)*4);
}
i=0;
RDF=fopen(ReadFileName,"r");
while(!feof(RDF)) //从文件读数据
{
fscanf(RDF, "%c", &Comma);
fscanf(RDF, "%lf", &DataRead[1]);
fscanf(RDF, "%c", &Comma);
fscanf(RDF, "%lf", &DataRead[2]);
fscanf(RDF, "%c", &Comma);
fscanf(RDF, "%lf", &DataRead[3]);
fscanf(RDF, "%c", &Comma);
fgets(Class[i],20,RDF);
AllData[i][0]=DataRead[0];
AllData[i][1]=DataRead[1];
AllData[i][2]=DataRead[2];
AllData[i][3]=DataRead[3];
if(strcmp(Class[i], "Iris-setosa\n") == 0)
else if(strcmp(Class[i], "Iris-versicolor\n") == 0)
else if(strcmp(Class[i], "Iris-virginica\n") == 0) Iris_virginica++;
i++;
}
fclose(RDF);
}
void UpdateCenterData() //添加元素后更新聚类均值
{
int i=0,j=0;
double sumx=0;
double sumy=0;
double sumz=0;
double sumt=0;
for(i=0;i<K;i++) //计算每个聚类的均值
{
sumx=0;
sumy=0;
sumz=0;
sumt=0;
for(j=0;j<ElementNum[i];j++) //计算某聚类的元素值和
{
sumx+=Cluster[i][j][0];
sumy+=Cluster[i][j][1];
sumz+=Cluster[i][j][2];
sumt+=Cluster[i][j][3];
}
if(ElementNum[i]>0)
{
CenterData[i][0]=sumx/ElementNum[i];
CenterData[i][1]=sumy/ElementNum[i];
CenterData[i][2]=sumz/ElementNum[i];
CenterData[i][3]=sumt/ElementNum[i];
}
}
}
int EqualJudge(double ** center1,double ** center2) //比较前后两次聚类均值是否相同
{
int i;
for(i=0;i<K;i++)
{
if(fabs( (center1[i][0]!=center2[i][0]) || (center1[i][1]!=center2[i][1]) || (center1[i][2]!=center2[i][2]) || (center1[i][3]!=center2[i][3]) ))
{
return FALSE;
}
}
return TRUE;
}
double F_measure()
{
int i,j,k;
int setosa=0,versicolor=0,virginica=0;
int MaxClass[3],TemClassNum[3];
double P[3], R[3], F[3];
double fmeasure;
for(i=0;i<K;i++)
{
setosa=0,versicolor=0,virginica=0;
for(j=0;j<ElementNum[i];j++)
{
for(k = 0; k < N; k++)
{
if(Cluster[i][j][4] == (double)k)
{
if(strcmp(Class[k], "Iris-setosa\n") == 0)
else if(strcmp(Class[k], "Iris-versicolor\n") == 0)
else if(strcmp(Class[k], "Iris-virginica\n") == 0) virginica++;
break;
}
}
}
if(setosa > versicolor)
{
if(setosa > virginica)
{
MaxClass[i] = setosa;
TemClassNum[i] = Iris_setosa;
}
else
{
MaxClass[i] = virginica;
TemClassNum[i] = Iris_virginica;
}
}
else
{
if(versicolor > virginica)
{
MaxClass[i] = versicolor;
TemClassNum[i] = Iris_versicolor;
}
else
{
MaxClass[i] = virginica;
TemClassNum[i] = Iris_virginica;
}
}
if(TemClassNum[i]!=0) P[i] = ((double)MaxClass[i])/((double)TemClassNum[i]);
if(ElementNum[i]!=0)
if((P[i]+R[i])!=0)
}
fmeasure = ((double)(TemClassNum[0]*F[0]+TemClassNum[1]*F[1]+TemClassNum[2]*F[2])) / ((double)(TemClassNum[0]+TemClassNum[1]+TemClassNum[2]));
return fmeasure;
}
void WriteToFile() //写入文件
{
int i = 0, j = 0, k = 0,t = 0;
char * WriteFileName="ResultFile.txt";
FILE * WDF;
if((WDF=fopen(WriteFileName,"w"))==NULL)
{
printf("ResultFile not exist!\n");
exit(0);
}
fprintf(WDF,"Data Set Name: Iris , Attributes Number = 4 ,
Instances Number = %d ,
for(i=0;i<K;i++)
{
fprintf(WDF,"Group %d, The Number of Cluster Members: %d , Center: (%lf, %lf, %lf, %lf)\n",i+1,BestClusterElement[i], BestCenterArray[i*4],BestCenterArray[i*4+1],BestCenterArray[i*4+2],BestCenterArray[i*4+3]);
for(j=0;j<BestClusterElement[i];j++)
{
fprintf(WDF, "%.1lf, %.1lf, %.1lf, %.1lf, ",BestClusterArray[t],BestClusterArray[t+1],BestClusterArray[t+2],BestClusterArray[t+3]);
for(k = 0; k < N; k++)
{
if(BestClusterArray[t+4] == (double)k)
{
fprintf(WDF, "%s", Class[k]);
break;
}
}
t += 5;
}
fprintf(WDF, "***********************************\n");
}
fprintf(WDF, "\nThe F-measure Results of Executting %d Times:\n", ExecTimes);
fprintf(WDF, "Max Min F-average\n");
fprintf(WDF, "%lf %lf %lf\n", FmeasureMax, FmeasureMin, FmeasureMid);
fprintf(WDF, "Time-consuming: %lf s\n", ((double)RunTime)/1000000); //按秒输出
fclose(WDF);
WriteFresult();
}
void WriteFresult()
{
char * WriteFresult="Fresult.txt";
FILE * WF;
if((WF=fopen(WriteFresult,"w"))==NULL)
{
printf("Fresult.txt not exist!\n");
exit(0);
}
fprintf(WF,"Data Set Name: Iris , Attributes Number = 4 ,
Instances Number = %d ,
fprintf(WF, "\nThe F-measure Results of Executting %d Times:\n", ExecTimes);
fprintf(WF, "Max Min F-average\n");
fprintf(WF, "%lf %lf %lf\n", FmeasureMax, FmeasureMin, FmeasureMid);
fprintf(WF, "Time-consuming: %lf s\n", ((double)RunTime)/1000000); //按秒输出
fclose(WF);
}
int main()
{
struct timeval StartTime,EndTime;
gettimeofday(&StartTime,NULL);
int Flag=1;
double FmeasureArray[ExecTimes];
double sumfmeasure = 0;
int i=0, j=0, k=0, t1=0, t2=0;
InitData(); //初始化数据
for(Exec=0; Exec<ExecTimes;Exec++)
{
Flag = 1;
InitCenter(); //K个聚类的均值初始化
while(Flag) //循环寻找最佳聚类
{
UpdateCluster(); //更新各个聚类
UpdateCenterData(); //更新各个聚类均值
if(EqualJudge(CenterData,CenterDataCopy)) //如果本次聚类各均值与上次相同,停止聚类
{
Flag=0;
}
else
{
CenterDataBackup(); //如不相同,则将新聚类备份
}
}
FmeasureArray[Exec] = F_measure(); //计算F-measure的最大值,最小值和平均值
sumfmeasure += FmeasureArray[Exec];
if(FmeasureMin > FmeasureArray[Exec])
{
FmeasureMin = FmeasureArray[Exec];
}
if(FmeasureMax < FmeasureArray[Exec])
{
FmeasureMax = FmeasureArray[Exec];
t1 = 0;
t2 = 0;
for(i=0;i<K;i++)
{
BestClusterElement[i] = ElementNum[i];
for(j=0; j < 4; j++)
{
BestCenterArray[t1++] = CenterData[i][j];
}
for(j=0;j<ElementNum[i];j++)
{
for(k=0; k<5; k++)
{
BestClusterArray[t2++] = Cluster[i][j][k];
}
}
}
}
}
FmeasureMid = sumfmeasure/ExecTimes;
gettimeofday(&EndTime, NULL);
RunTime = 1000000*(EndTime.tv_sec - StartTime.tv_sec) + EndTime.tv_usec - StartTime.tv_usec;
WriteToFile(); //结果写入文件
return 0;
}

加载中…