#!/usr/bin/nawk -f # # --Preprocessor program to generate box plots for GRACE-- # # By default, the program creates Tukey's "schematic plot" # Optionally, the program can create a box graph with whiskers # drawn to the 10th & 90th percentiles. # In either case, the statistics are computed and the boxes drawn # as described by Helsel & Hirsch, 1992, pages 24-26,451-455 # # Original program used as front-end to g2: # j.miller 5/4/94 version 1.00 # j.miller 5/17/94 version 1.02 (unixsort added and improved RE for numbers) # (misc comments corrected) # Revisions: # Stewart Rounds 5/31/95 modified to work with xmgr instead of g2 # U.S. Geological Survey # Portland, Oregon, USA # # Stewart Rounds 3/6/96 fixed bug in unixsort routine # Stewart Rounds 12/31/96 fix to allow numbers to be recognized as such # when used as box labels # Stewart Rounds 2/7/2000 modified to work with GRACE rather than xmgr # # This script expects an input file that contains two fields delimited with # one or more spaces or single tabs. These can be simple ASCII files or rdb # tables. The first field is the box name (variable that subsets the data). # The box name cannot start with a "#" symbol...these are considered comment # lines (as are used in rdb tables). The box name can be a string or a numeric. # The second field is the data to be evaluated for plotting boxes. # The data MUST BE SORTED by box name (this can be done with the unix # sort command). # # syntax: # grbox DATA.FILE [prefix=NAME] [type=1090] # [yaxis=LOG] [label=number] [flags=gr_options] # # example command lines: # grbox DATA.FILE # where DATA.FILE is the input file name # In this case a default boxplot (Tukey's schematic plot) is drawn. # Output is to an grace project file named "box.gr" # # grbox DATA.FILE prefix=NAME # where DATA.FILE is the input file name and prefix=NAME provides a # prefix string to be used on output. # In this case a default boxplot (Tukey's schematic plot) is drawn # and the prefix string will be used to name the output file # "NAME.gr". # This option allows multiple runs of the grbox script without # overwriting the output file. # # grbox DATA.FILE type=1090 # where DATA.FILE is the input file name and type=1090 changes the # default boxplot to a box graph with whiskers drawn to the 10th and # 90th percentiles. # Output is to an grace project file named "box.gr" # # grbox DATA.FILE type=1090 prefix=NAME # where DATA.FILE is the input file name and type=1090 changes the # default boxplot to a box graph with whiskers drawn to the 10th and # 90th percentiles. Prefix=NAME provides a prefix string causing # the output file to be named "NAME.gr" instead of "box.gr" # # grbox DATA.FILE yaxis=LOG # where DATA.FILE is the input file name and yaxis=LOG changes the # grace y-axis default from linear to logarithmic. # Output is to an grace project file named "box.gr" # # grbox DATA.FILE label=number # where DATA.FILE is the input file name and label=number causes # grace to recognize the box labels as numbers rather than just labels. # Output is to an grace project file named "box.gr" # # grbox DATA.FILE flags="-hardcopy -hdevice PostScript" # where DATA.FILE is the input file name # In this case a default boxplot (Tukey's schematic plot) is drawn. # Output is to an grace project file named "box.gr" # The flags option allows command line options to be passed to the # session of grace that this script will spawn. In this case, the flags # -hardcopy and -hdevice PostScript will cause grace to print the graph # using the postscript device driver. The -hardcopy option prevents an # interactive session of grace from starting. See grace's documentation # for a description of all command line options. # # The program does the following things: # 1. prints statistics to standard out # 2. creates a grace project file called box.gr or NAME.gr # 3. executes xmgrace with the newly created file # # Two files must be present to run this program: grbox, and DATA.FILE # # To execute: First, type the appropriate command line as given above # Second, use grace to edit the resulting plot to your satisfaction # Save the results. # BEGIN{ print " " print "Box plot preprocessor to xmgrace" print "...reading data" } # # Ignore comment lines (enable reading of rdb tables) and read only those data lines # with numbers in field two # $0 !~ /^#/ && $2~/^-?(([0-9]+\.?[0-9]*)|([0-9]*\.?[0-9]+))([eE]-?\+?[0-9]+)?$/{ if(first_rec_flag < 1){ # if first record of data count=0 # set counters and box name nbox++ name[nbox]=$1 oldbox=$1 first_rec_flag++ } if($1 != oldbox && first_rec_flag > 0){ # after first record nbox++ # set counters when box name changes name[nbox]=$1 # and save box name oldbox=$1 n_data[nbox-1]=count count=0 } total_count++ # count total number of data points count++ # count number of data points in this box data_all[nbox,count]=$2 # store data } END{ # # Print statistics for the input data # print "...read a total of",total_count,"data values for",nbox,"boxplots" print " from a file with",NR,"lines" # # Check for boxplot type set on command line == 1090 # If not close to 1090 then set to default # if(type ~ /^10/){ type="1090" # set type to box graph }else{ type="tukey" # set type to default tukey plot } # # Check for filename prefix set on command line # If not given on the command line, set grace output file name to "box.gr" # if(length(prefix)>0){ gr_tmplt= prefix ".gr" # add gr to prefix for grace file }else{ gr_tmplt= "box.gr" # default output file name } # # Print header for grace file. # print "# Grace project file" > gr_tmplt print "# " > gr_tmplt print "@version 50100" > gr_tmplt # # Initialize counters for boxes and for outside and farout points. # num_box=0 # number of boxes to plot num_out=0 # number of outside points num_far=0 # number of farout points for(i=1;i<=nbox;i++){ plotbox[i]=1 # flag for plotting box i: 1=plot, 0=no plot nout[i]=0 # outside points in box i nfar[i]=0 # farout points in box i } # # For tukey's schematic plot-- # Compute percentiles and whisker locations, print results to stdout. # Find and write outside and farout points. # if(type=="tukey"){ print " " print "...drawing Tukey's schematic plot" print " " n_data[nbox]=count # save number of points for last box for(i=1;i<=nbox;i++){ # loop thru all boxes print "...working on box",name[i],"which is box",i,"out of a total of",nbox,"boxes" for(j=1;j<=n_data[i];j++)data[j]=data_all[i,j] # read 1d part of 2d array if(n_data[i] < 200)qsort(data,1,n_data[i]) # sort data to compute percentiles else unixsort(data,n_data[i]) # qsort (small n), unixsort (large n) if(i==1){ ymin = data[1] ymax = data[n_data[i]] } if(i>1 && data[1]1 && data[n_data[i]]>ymax)ymax=data[n_data[i]] if(n_data[i]<3){ # if n < 3 then quartiles algorithm makes no sense print "***N LESS THAN 3 (n=",n_data[i] ")..." plotbox[i]=0 for(j=1;j<=n_data[i];j++){ num_far++ nfar[i]++ data_far[i,nfar[i]]=data[j] } continue # move on to the next box } num_box++ p25=percentile(data,25,n_data[i]) # compute percentiles p50=percentile(data,50,n_data[i]) p75=percentile(data,75,n_data[i]) lowstep=(p25-1.5*(p75-p25)) # compute lower extreme for whisker lowstep2=(p25-2*1.5*(p75-p25)) # compute lower extreme for far out points upstep=(p75+1.5*(p75-p25)) # compute upper extreme for whisker upstep2=(p75+2*1.5*(p75-p25)) # compute upper extreme for far out points lw=p25 # initialize lower whisker to p25 uw=p75 # initialize upper whisker to p75 # # Calculate outside and farout points. # Find upper and lower whisker points. # for(k=1;k<=int(.25*n_data[i]+1);k++){ # search for lower whisker & outside points if(data[k]=lowstep && data[k]=int(.75*n_data[i]);k--){ # search for upper whisker & outside points if(data[k]>upstep2){ num_far++ nfar[i]++ data_far[i,nfar[i]]=data[k] # farout points continue # if point is farout continue loop on next point } if(data[k]>upstep){ num_out++ nout[i]++ data_out[i,nout[i]]=data[k] #outside points continue # if point is outside continue loop on next point } if(data[k]<=upstep && data[k]>p75){ uw=data[k] # set lower whisker value to first point within whisker break # if point is first within whisker, then stop loop } } # # Save box stats to an array. # box_lw[i] = lw box_lo[i] = p25 box_md[i] = p50 box_hi[i] = p75 box_uw[i] = uw # # Print stats to std out. # if(n_data[i]<12)print "**** N=",n_data[i]," NOTE: For small data sets, check results****" print " stats----> n lower-step2 lower-step1 lowwhisker p25" printf " %12d %12.4g %12.4g %12.4g %12.4g\n",n_data[i],lowstep2,lowstep,lw,p25 print " ----> p50 p75 upwhisker upper-step1 upper-step2" printf " %12.4g %12.4g %12.4g %12.4g %12.4g\n",p50,p75,uw,upstep,upstep2 print " " } } # # For box graphs-- # Compute percentiles, print results to stdout, and print out plot positions. # if(type=="1090"){ print " " print "...drawing box graph with whiskers to the 10th and 90th percentiles" print " " n_data[nbox]=count # save number of values for last box for(i=1;i<=nbox;i++){ # loop thru all boxes print "...working on box",name[i],"which is box",i,"out of a total of",nbox,"boxes" for(j=1;j<=n_data[i];j++)data[j]=data_all[i,j] # read 1d part of 2d array if(n_data[i] < 200)qsort(data,1,n_data[i]) # sort data to compute percentiles else unixsort(data,n_data[i]) # qsort (small n), unixsort (large n) if(i==1){ ymin = data[1] ymax = data[n_data[i]] } if(i>1 && data[1]1 && data[n_data[i]]>ymax)ymax=data[n_data[i]] if(n_data[i]<9){ # if n < 9 than algorithm for 10th & 90th percentiles makes no sense print "***N LESS THAN 9 (n=",n_data[i] ")..." plotbox[i]=0 for(j=1;j<=n_data[i];j++){ num_far++ nfar[i]++ data_far[i,nfar[i]]=data[j] } continue # move on to the next box } num_box++ p10=percentile(data,10,n_data[i]) # compute percentiles p25=percentile(data,25,n_data[i]) p50=percentile(data,50,n_data[i]) p75=percentile(data,75,n_data[i]) p90=percentile(data,90,n_data[i]) print " stats: n,p10,p25,p50,p75,p90=",n_data[i],p10,p25,p50,p75,p90 # # Find points that are outside of whiskers. # for(k=1;k<=int(.10*n_data[i]+1);k++){ if(data[k]=int(.90*n_data[i]);k--){ if(data[k]>p90){ num_far++ nfar[i]++ data_far[i,nfar[i]]=data[k] # plot with circles } } # # Save box stats to an array. # box_lw[i] = p10 box_lo[i] = p25 box_md[i] = p50 box_hi[i] = p75 box_uw[i] = p90 } } # # Define world and viewport limits. Set symbol definitions. # print "@with g0" > gr_tmplt print "@g0 on" > gr_tmplt if(yaxis=="log" || yaxis=="LOG"){ print "@g0 type logy" > gr_tmplt }else{ print "@g0 type xy" > gr_tmplt } print "@ world xmin 0.0" > gr_tmplt print "@ world xmax "nbox+1 > gr_tmplt print "@ world ymin "ymin > gr_tmplt print "@ world ymax "ymax > gr_tmplt print "@ view xmin 0.15" > gr_tmplt print "@ view xmax 0.85" > gr_tmplt print "@ view ymin 0.15" > gr_tmplt print "@ view ymax 0.85" > gr_tmplt print "@ s0 type xyboxplot" > gr_tmplt print "@ s0 line type 0" > gr_tmplt print "@ s0 symbol 0" > gr_tmplt print "@ s0 symbol linestyle 1" > gr_tmplt print "@ s0 symbol linewidth 1" > gr_tmplt print "@ s0 symbol color 1" > gr_tmplt print "@ s0 errorbar size 0.0" > gr_tmplt print "@ s1 type xy" > gr_tmplt print "@ s1 line type 0" > gr_tmplt print "@ s1 symbol 9" > gr_tmplt print "@ s1 symbol size 0.65" > gr_tmplt print "@ s1 symbol color 1" > gr_tmplt print "@ s1 symbol linestyle 1" > gr_tmplt print "@ s1 symbol linewidth 1" > gr_tmplt print "@ s2 type xy" > gr_tmplt print "@ s2 line type 0" > gr_tmplt print "@ s2 symbol 1" > gr_tmplt print "@ s2 symbol size 0.50" > gr_tmplt print "@ s2 symbol fill 0" > gr_tmplt print "@ s2 symbol color 1" > gr_tmplt print "@ s2 symbol linestyle 1" > gr_tmplt print "@ s2 symbol linewidth 1" > gr_tmplt # # Plot x-axis custom labels to grace file. # print "@ xaxis tick on" > gr_tmplt print "@ xaxis tick major 1.0" > gr_tmplt print "@ xaxis tick minor ticks 0" > gr_tmplt print "@ xaxis ticklabel start type spec" > gr_tmplt print "@ xaxis ticklabel start 1.0" > gr_tmplt print "@ xaxis ticklabel stop type spec" > gr_tmplt print "@ xaxis ticklabel stop "nbox > gr_tmplt print "@ xaxis tick spec type both" > gr_tmplt print "@ xaxis tick spec "nbox > gr_tmplt for(i=1;i<=nbox;i++){ print "@ xaxis tick major "i-1", "i > gr_tmplt print "@ xaxis ticklabel "i-1", \""name[i]"\"" > gr_tmplt } print "@ yaxis tick on" > gr_tmplt if(yaxis=="log" || yaxis=="LOG"){ print "@ yaxis tick major 10" > gr_tmplt print "@ yaxis tick minor ticks 9" > gr_tmplt }else{ print "@ yaxis tick major "(ymax-ymin)/5.0 > gr_tmplt print "@ yaxis tick minor ticks 3" > gr_tmplt } # # Plot data points to the grace file. # print "@with g0" > gr_tmplt print "@g0 on" > gr_tmplt if(num_box>0){ # boxes print "@type xyboxplot" > gr_tmplt print "@target s0" > gr_tmplt for(i=1;i<=nbox;i++){ if(plotbox[i]==1){ if(label=="number"){ printf "%16lg ",name[i] > gr_tmplt }else{ printf "%16lg ",i > gr_tmplt } printf "%16lg %16lg %16lg %16lg %16lg\n",box_md[i],box_lo[i],box_hi[i],box_lw[i],box_uw[i] > gr_tmplt } } print "&" > gr_tmplt } if(num_out>0){ # outside points -- x's print "@type xy" > gr_tmplt print "@target s1" > gr_tmplt for(i=1;i<=nbox;i++){ for(j=1;j<=nout[i];j++){ if(label=="number"){ printf "%16lg %16lg\n",name[i],data_out[i,j] > gr_tmplt }else{ printf "%16lg %16lg\n",i,data_out[i,j] > gr_tmplt } } } print "&" > gr_tmplt } if(num_far>0){ # farout points -- circles print "@type xy" > gr_tmplt print "@target s2" > gr_tmplt for(i=1;i<=nbox;i++){ for(j=1;j<=nfar[i];j++){ if(label=="number"){ printf "%16lg %16lg\n",name[i],data_far[i,j] > gr_tmplt }else{ printf "%16lg %16lg\n",i,data_far[i,j] > gr_tmplt } } } print "&" > gr_tmplt } # # Execute grace with newly-created data file. # print " " print "...grace file named: "gr_tmplt print " " print "...starting xmgrace and loading "gr_tmplt"..." system("xmgrace " gr_tmplt " " flags " &") print " " print "Done." } # # Percentile function...following Helsel and Hirsh # function percentile(A,p,n, t,low,high) { t=(n+1)*p/100 # position in sorted data set low=int(t) # actual position below high=low+1 # actual position above return A[low]+(t-int(t))*(A[high]-A[low]) # interpolated value } # # Qsort function--quicksort from "The awk programming language" p161 # by Aho, Kernighan, Weinberger # function qsort(A,left,right, i,last) { if(left>=right) #do nothing if array contains return # less than two elements swap(A,left,left+int((right-left+1)*rand())) last=left # A[left] is now partition element for(i=left+1;i<=right;i++) if(A[i] "tmp0987654321xxx" i=0 while("sort -n tmp0987654321xxx" | getline){ i++ A[i]=$1 } close("tmp0987654321xxx") close("sort -n tmp0987654321xxx") system("rm tmp0987654321xxx") }