(* :Title: Data Manipulation *) (* :Author: Wolfram Research, Inc. *) (* :Summary: This package provides an extension of the list manipulation functions that are built-in to Mathematica. Additional functions useful for manipulating statistical data include frequency counting and computing cumulative sums. *) (* :Context: Statistics`DataManipulation` *) (* :Package Version: 1.4 *) (* :Copyright: Copyright 1990-2007, Wolfram Research, Inc. *) (* :History: 1.1: original version, 1990. 1.2: improved speed of Frequencies using V3.0 function Split (contribution of Pavel Zaruba), improved speed of BinCounts and RangeCounts, ECM, 1997. 1.3: improved efficiency for a number of functions, Darren Glosemeyer, 2004. 1.4 obsoleted BooleanSelect and CumulativeSums which are superceded by Pick and Accumulate, improved efficiency of Bin*, Range* and Category* functions, and added IntervalClosure option to BinCounts, RangeCounts, BinLists, and RangeLists, Darren Glosemeyer, 2005-2006. *) (* :Keywords: *) (* :Source: None. *) (* :Warning: Expands the definition of Column. *) (* :Mathematica Version: 6.0 *) (* :Limitation: None known. *) (* :Discussion: *) Message[General::obspkg,"Statistics`DataManipulation`"] BeginPackage["Statistics`DataManipulation`"] ColumnTake::usage = "ColumnTake[data, spec] takes the specified columns in data." ColumnDrop::usage = "ColumnDrop[data, spec] drops the specified columns in data." ColumnJoin::usage = "ColumnJoin[data1, data2, ...] joins elements in corresponding \ columns in the datai." RowJoin::usage = "RowJoin[data1, data2, ...] joins elements in corresponding rows \ in the datai." DropNonNumeric::usage = "DropNonNumeric[data] drops elements or rows that contain non-numeric \ elements in data." DropNonNumericColumn::usage = "DropNonNumericColumn[data] drops columns that contain non-numeric \ elements in data." BooleanSelect::usage = "BooleanSelect[list, sel] keeps elements of list for which the \ corresponding element of sel is True." QuantileForm::usage = "QuantileForm[list] sorts the elements in list, then gives \ a list of the elements, paired with their quantile position." CumulativeSums::usage = "CumulativeSums[list] gives cumulative sums of list." CategoryCounts::usage = "CategoryCounts[{x1, x2, ...}, {e1, e2, ...}] gives a list containing the \ number of elements in the data {x1, x2, ...} that match each of the ei. \ CategoryCounts[{x1, x2, ...}, {{e11, e12, ...}, {e21, e22, ...}, ...}] gives a \ list of the number of elements in the data {x1, x2, ...} that match any of the \ elements in each list {ei1, ei2, ...}. CategoryCounts[{{x1, y1}, {x2, y2}, \ ...}, {xe1, xe2, ...}, {ye1, ye2, ...}] and CategoryCounts[{{x1, y1}, \ {x2, y2}, ...}, {{xe11, xe12, ...}, {xe21, xe22, ...}, ...}, \ {{ye11, ye12, ...}, {ye21, ye22, ...}, ...}] both give 2-dimensional arrays of \ category counts for the bivariate data {{x1, y1}, {x2, y2}, ...}. In general, \ CategoryCounts gives a p-dimensional array of category counts for p-variate \ data." CategoryLists::usage = "CategoryLists[{x1, x2, ...}, {e1, e2, ...}] gives lists of the elements in the \ data {x1, x2, ...} that match each of the ei. CategoryLists[{x1, x2, ...}, \ {{e11, e12, ...}, {e21, e22, ...}, ...}] gives lists of the elements in the \ data {x1, x2, ...} that match any of the elements in each list {ei1, ei2, ...}. \ CategoryLists[{{x1, y1}, {x2, y2}, ...}, {xe1, xe2, ...}, {ye1, ye2, ...}] \ and CategoryLists[{{x1, y1}, {x2, y2}, ...}, {{xe11, xe12, ...}, \ {xe21, xe22, ...}, ...}, {{ye11, ye12, ...}, {ye21, ye22, ...}, ...}] both \ give 2-dimensional arrays of category lists for the bivariate data {{x1, y1}, \ {x2, y2}, ...}. In general, CategoryLists gives a p-dimensional array of \ category lists for p-variate data." Frequencies::usage = "Frequencies[list] gives a list of the distinct elements in list, \ together with the frequencies with which they occur." RangeCounts::usage = "RangeCounts[{x1, x2, ...}, {c1, c2, ..., cm}] gives a list of the number of \ elements in the data {x1, x2, ...} that lie between successive cutoffs. \ The range boundaries are {x < c1, c1 <= x < c2, ..., x >= cm}. \ RangeCounts[{{x1, y1}, {x2, y2}, ...}, {xc1, xc2, ..., xcm}, \ {yc1, yc2, ..., ycn}] gives a 2-dimensional array of range counts for the \ bivariate data {{x1, y1}, {x2, y2}, ...}. In general, RangeCounts gives a \ p-dimensional array of range counts for p-variate data." RangeLists::usage = "RangeLists[{x1, x2, ...}, {c1, c2, ..., cm}] gives lists of the elements in the \ data {x1, x2, ...} that lie between successive cutoffs. RangeLists[{{x1, y1}, \ {x2, y2}, ...}, {xc1, xc2, ...}, {yc1, yc2, ...}] gives a 2-dimensional array \ of range lists for the bivariate data {{x1, y1}, {x2, y2}, ...}. In general, \ RangeLists gives a p-dimensional array of range lists for p-variate data." Unprotect[Column,ColumnTake,ColumnDrop,ColumnJoin,RowJoin,DropNonNumeric, DropNonNumericColumn,BooleanSelect,Frequencies,QuantileForm,CumulativeSums,CategoryCounts, CategoryLists,RangeCounts,RangeLists]; Begin["`Private`"] issueObsoleteFunMessage[fun_, context_] :=Message[General::obspkgfn, fun, context] Column[data:{___List}, n_Integer] := ( issueObsoleteFunMessage[Column, "Statistics`DataManipulation`"]; data[[All,n]]) Column[data:{___List}, ni:{__Integer}] := ( issueObsoleteFunMessage[Column, "Statistics`DataManipulation`"]; data[[All,ni]]) ColumnTake[data:{___List}, spec_] := ( issueObsoleteFunMessage[ColumnTake, "Statistics`DataManipulation`"]; Take[data, All, spec]) ColumnDrop[data:{___List}, spec_] := ( issueObsoleteFunMessage[ColumnDrop, "Statistics`DataManipulation`"]; Drop[data, None, spec]) ColumnJoin[data:{___List}..] := ( issueObsoleteFunMessage[ColumnJoin, "Statistics`DataManipulation`"]; Join[data]) RowJoin[data:{___List}..] := ( issueObsoleteFunMessage[RowJoin, "Statistics`DataManipulation`"]; Apply[Join, Transpose[{data}], {1}]) /; Equal @@ Map[Length, {data}] DropNonNumeric[data_List] := ( issueObsoleteFunMessage[DropNonNumeric, "Statistics`DataManipulation`"]; Select[data, NumberQ[N[#]]&]) /; VectorQ[data] DropNonNumeric[data:{___List}] := ( issueObsoleteFunMessage[DropNonNumeric, "Statistics`DataManipulation`"]; Select[data, VectorQ[N[#],NumberQ]&]) DropNonNumericColumn[data:{___List}] := ( issueObsoleteFunMessage[DropNonNumericColumn, "Statistics`DataManipulation`"]; Module[{d = DropNonNumeric[Transpose[data]]}, If[d==={}, {}, Transpose[d] ] ])/;Apply[SameQ,Map[Length,data]] (* BooleanSelect is superceded by Pick in 5.1; issue a message on first usage to warn users *) BooleanSelect[list_List, sel_List] := ( issueObsoleteFunMessage[BooleanSelect, "Statistics`DataManipulation`"]; Pick[list,sel]) /; Length[list] == Length[sel] QuantileForm[list_List] := (issueObsoleteFunMessage[QuantileForm, "Statistics`DataManipulation`"]; Transpose[{Range[Length[list]]/Length[list], Sort[list]}]) /; VectorQ[list] (* VectorQ check because the sample Quantile is not easily defined for multidimensional data. *) (* CumulativeSums is superceded by Accumulate in 6.0; issue a message on first usage to warn users *) CumulativeSums[list_List] := ( issueObsoleteFunMessage[CumulativeSums, "Statistics`DataManipulation`"]; Accumulate[list]) (* ================================ Frequencies ============================ *) Frequencies[list_List] := (issueObsoleteFunMessage[Frequencies, "Statistics`DataManipulation`"]; {Length[#],First[#]}&/@Split[Sort[list]]) (* Pavel Zaruba's suggestion for versions 2.2 and earlier versions that do not support Split: Frequencies[list_List]:= Module[{freq,p,f}, freq={}; p=list; f=0; Scan[If[p=!=#,freq={freq,f};p=#;f=1,++f]&,Sort[list]]; Transpose[{Rest[Flatten[{freq, f}]],Union[list]}]] *) realQ[x_]:=(NumberQ[#]&&Im[#]===0)&[N[x]] realOrRealInfinityQ[x_]:=realQ[x]||SameQ[x,DirectedInfinity[1]]||SameQ[x,DirectedInfinity[-1]] validCutOffsQ[x_]:=VectorQ[x,realOrRealInfinityQ] RangeCounts[data_?VectorQ,clists_?validCutOffsQ]:=( issueObsoleteFunMessage[RangeCounts, "Statistics`DataManipulation`"]; Block[{res=BinCounts[data,{Flatten[{-Infinity,clists,Infinity}]},"BinClosure"->Left]}, res/;FreeQ[res,BinCounts]]) RangeCounts[data_?MatrixQ,clists__?validCutOffsQ]:=( issueObsoleteFunMessage[RangeCounts, "Statistics`DataManipulation`"]; Block[{res=Apply[BinCounts[data,##,"BinClosure"->Left]&, Map[{Flatten[{-Infinity,#,Infinity}]}&,{clists}]]}, res/;FreeQ[res,BinCounts]]) RangeLists[data_?VectorQ,clists_?validCutOffsQ]:=( issueObsoleteFunMessage[RangeLists, "Statistics`DataManipulation`"]; Block[{res=BinLists[data,{Flatten[{-Infinity,clists,Infinity}]},"BinClosure"->Left]}, res/;FreeQ[res,BinLists]]) RangeLists[data_?MatrixQ,clists__?validCutOffsQ]:=( issueObsoleteFunMessage[RangeLists, "Statistics`DataManipulation`"]; Block[{res=Apply[BinLists[data,##,"BinClosure"->Left]&, Map[{Flatten[{-Infinity,#,Infinity}]}&,{clists}]]}, res/;FreeQ[res,BinLists]]) (* =================== CategoryCounts and Categoryists ==================== *) CategoryCounts::dims = CategoryLists::dims = "The dimension `1` of the first argument is not the same as the number of category specifications `2`." CategoryCounts::lis = CategoryLists::lis ="The argument `1` at position `2` is expected to be a list." CategoryCounts[args___]:=( issueObsoleteFunMessage[CategoryCounts, "Statistics`DataManipulation`"]; Block[{res=iCCountList[{args},CategoryCounts,Count]}, res/;res=!=$Failed]) CategoryLists[args___]:=( issueObsoleteFunMessage[CategoryLists, "Statistics`DataManipulation`"]; Block[{res=iCCountList[{args},CategoryLists,Cases]}, res/;res=!=$Failed]) iCCountList[arglist_,caller_,fun_]:=Block[{arg1,restargs,vecQ,matQ,genmatQ}, If[ Not[ArgumentCountQ[caller,Length[arglist],2,Infinity]], Return[$Failed]]; arg1=arglist[[1]]; restargs=Rest[arglist]; If[Head[arg1]=!=List,Message[caller::lis,arg1,1];Return[$Failed]]; With[{pos=Position[restargs,_?(Head[#]=!=List&),{1},Heads->False]}, If[pos=!={}, Message[caller::lis, Extract[restargs,pos][[1]],pos[[1]]+1]; Return[$Failed] ]]; (* the data are categorical, so a matrix could have arbitrary elements *) vecQ=VectorQ[arg1]; matQ=If[vecQ,False,MatrixQ[arg1]]; genmatQ=If[vecQ||matQ,False,MatrixQ[arg1, (MatchQ[#, _] &)]]; Which[(vecQ||Not[genmatQ||matQ])&&Length[restargs]===1, listCCCL[arg1, restargs[[1]],fun], (matQ||genmatQ)&&Length[restargs]===Length[arg1[[1]]], matrixCCCL[arg1, restargs,fun], True, Message[caller::dims,If[matQ,Length[arg1[[1]]],1],Length[restargs]]; $Failed ] ] listCCCL[list_,clist_,fun_]:=Block[{newclist}, newclist=Map[ If[Head[#]===List, Apply[Alternatives, #],#]&,clist]; Map[fun[list, #]&, newclist] ] matrixCCCL[list_,clists_,fun_]:=Block[{newclist}, newclist=Map[ If[Head[#]===List, Apply[Alternatives, #],#]&,clists,{2}]; Map[fun[list, #]&, Apply[Outer[List, ##]&,newclist], {Length[clists]}] ] End[ ] SetAttributes[ Column ,ReadProtected]; SetAttributes[ ColumnTake ,ReadProtected]; SetAttributes[ ColumnDrop ,ReadProtected]; SetAttributes[ ColumnJoin ,ReadProtected]; SetAttributes[ RowJoin ,ReadProtected]; SetAttributes[ DropNonNumeric ,ReadProtected]; SetAttributes[ DropNonNumericColumn ,ReadProtected]; SetAttributes[ BooleanSelect, ReadProtected]; SetAttributes[ QuantileForm, ReadProtected]; SetAttributes[ CumulativeSums, ReadProtected]; SetAttributes[ CategoryCounts, ReadProtected]; SetAttributes[ CategoryLists, ReadProtected]; SetAttributes[ RangeCounts, ReadProtected]; SetAttributes[ RangeLists, ReadProtected]; Protect[Column,ColumnTake,ColumnDrop,ColumnJoin,RowJoin,DropNonNumeric, DropNonNumericColumn,BooleanSelect,Frequencies,QuantileForm,CumulativeSums,CategoryCounts, CategoryLists,RangeCounts,RangeLists]; EndPackage[ ]