options nocenter nodate nonumber formdlim="-"; *Arrays Seminar; *Recoding variables; *Note: single dash in variable list means numerically consecutive; data faminc; input famid faminc1-faminc12 ; cards; 1 3281 3413 3114 2500 2700 3500 3114 3319 3514 1282 2434 2818 2 4042 3084 3108 3150 3800 3100 1531 2914 3819 4124 4274 4471 3 6015 6123 6113 6100 6100 6200 6186 6132 3123 4231 6039 6215 ; run; /*heading option controls direction of col headings*/ proc print data=faminc heading=H; run; *Using if-then manually; data recode_manual; set faminc; if faminc1 < 3000 then faminc1=.; if faminc2 < 3000 then faminc2=.; if faminc3 < 3000 then faminc3=.; if faminc4 < 3000 then faminc4=.; if faminc5 < 3000 then faminc5=.; if faminc6 < 3000 then faminc6=.; if faminc7 < 3000 then faminc7=.; if faminc8 < 3000 then faminc8=.; if faminc9 < 3000 then faminc9=.; if faminc10 < 3000 then faminc10=.; if faminc11 < 3000 then faminc11=.; if faminc12 < 3000 then faminc12=.; run; proc print data=recode_manual noobs heading=H; run; *Recoding variables using an array; data recode_array; set faminc; array Afaminc(12) faminc1-faminc12; do i = 1 to 12; if Afaminc[i] < 3000 then Afaminc[i] = . ; end; drop i; run; proc print data=recode_array noobs heading=H; run; ********************************************************; *Reverse items on a -3 to +3 scale using an array; *Input the score data set; data score; input item1 item2 item3 item4; cards; -2 1 -3 0 -1 2 -2 1 0 -1 -3 -1 ; run; proc print data=score; run; data score_array1; set score; array item(4) item1-item4; do i = 1 to 4; item[i] = -1*item[i]; end; run; proc print data=score_array1; run; ********************************************; *Computing new variables; *computing the tax income variables manually; data tax_manual; set faminc; taxinc1 = faminc1 * .10 ; taxinc2 = faminc2 * .10 ; taxinc3 = faminc3 * .10 ; taxinc4 = faminc4 * .10 ; taxinc5 = faminc5 * .10 ; taxinc6 = faminc6 * .10 ; taxinc7 = faminc7 * .10 ; taxinc8 = faminc8 * .10 ; taxinc9 = faminc9 * .10 ; taxinc10= faminc10 * .10 ; taxinc11= faminc11 * .10 ; taxinc12= faminc12 * .10 ; run; proc print data=tax_manual noobs heading=H; var famid faminc1-faminc12 taxinc1-taxinc12; run; *computing the same tax income variables using an array; *The second array Ataxinc because we are creating 12 new variables; data tax_array; set faminc; array Afaminc(12) faminc1-faminc12; /* existing vars */ array Ataxinc(12) taxinc1-taxinc12; /* new vars */ do month = 1 to 12; Ataxinc[month] = Afaminc[month]*0.1; end; run; proc print data=tax_array noobs heading=H; var famid faminc1-faminc12 taxinc1-taxinc12; run; **********************************************************; *Collapsing over variables; *Creating the total income per quarter variables manually; data quarter_manual; set faminc; incq1 = faminc1 + faminc2 + faminc3; incq2 = faminc4 + faminc5 + faminc6; incq3 = faminc7 + faminc8 + faminc9; incq4 = faminc10 + faminc11 + faminc12; run; proc print data=quarter_manual; var incq1 faminc1-faminc3; run; *Creating the total income per quarter variables using arrays; data quarter_array; set faminc; array Afaminc(12) faminc1-faminc12; /* existing vars */ array Aquarter(4) incq1-incq4; /* new vars */ do q = 1 to 4; Aquarter[q] = Afaminc[3*q-2] + Afaminc[3*q-1] + Afaminc[3*q]; end; run; /* For q=1: Aquarter[1] = Afaminc[3*1-2] + Afaminc[3*1-1] + Afaminc[3*1] = Afaminc[1] + Afaminc[2] + Afaminc[3] For q=2: Aquarter[2] = Afaminc[3*2-2] + Afaminc[3*2-1] + Afaminc[3*2] = Afaminc[4] + Afaminc[5] + Afaminc[6] */ proc print data=quarter_array; var incq1 faminc1-faminc3; run; ***********************************************************; *ID patterns across variables using arrays; *ID months were income was less than half of previous month; *Store information in dummy variables lowinc2-lowinc12; *loop over months 2-12 (month 1 has no previous month!); *Variable ever indicates if this ever happened; data pattern; set faminc; length ever $ 4; array Afaminc(12) faminc1-faminc12; /*existing vars*/ array Alowinc(2:12) lowinc2-lowinc12; /* new vars */ do m = 2 to 12; if Afaminc[m] < (Afaminc[m-1] / 2) then Alowinc[m] = 1; else Alowinc[m] = 0; end; sum_low = sum(of lowinc:); /*sums over all vars with lowinc as part of name */ if sum_low > 0 then ever='Yes'; if sum_low = 0 then ever='No'; drop m sum_low; run; proc print data=pattern noobs heading=H; run; ************************************************************; *Reshaping wide to long; *Reshaping wide to long creating only one variable--Manually; data wide; input famid faminc96 faminc97 faminc98 ; cards; 1 40000 40500 41000 2 45000 45400 45800 3 75000 76000 77000 ; run; data long_manual; set wide; year=96; faminc=faminc96; output; year=97; faminc=faminc97; output; year=98; faminc=faminc98; output; run; proc print data=long_manual; var famid year faminc; run; *illustrating the importance of all the output statements; data problem; set wide; year=96; faminc=faminc96; *output; year=97; faminc=faminc97; *output; year=98; faminc=faminc98; output; run; proc print data=problem; var famid year faminc; run; *reshaping wide to long creating only one variable using arrays; *Note: we need the output statement inside the do loop; *because we want to output after each year, just like in the above ex.; data long_array; set wide; array Afaminc(96:98) faminc96 - faminc98; do year = 96 to 98; faminc = Afaminc[year]; output; end; drop faminc96-faminc98; run; proc print data=long_array; run; *Reshaping wide to long creating multiple variables (including string vars) using arrays; data multi_wide; length debt96 $ 3 debt97 $ 3 debt98 $ 3; input famid faminc96 faminc97 faminc98 spend96 spend97 spend98 debt96 debt97 debt98 ; cards; 1 40000 40500 41000 38000 39000 40000 yes yes no 2 45000 45400 45800 42000 43000 44000 yes no no 3 75000 76000 77000 70000 71000 72000 no no no ; run; proc print data=multi_wide; run; data multi_long; set multi_wide; length debt $ 3; array Afaminc(96:98) faminc96-faminc98; array Aspend(96:98) spend96-spend98; array Adebt(96:98) debt96-debt98; do year = 96 to 98; faminc = Afaminc[year]; spend = Aspend[year]; debt = Adebt[year]; output; end; drop faminc96-faminc98 spend96-spend98 debt96-debt98; run; proc print data=multi_long; var famid year faminc spend debt; run; *Reshaping wide to long in presence of character suffixes; *In the above example we had numeric suffixes (96, 97 and 98); *We can reshape even if we have character suffixes such (old, now, future); data character; length name_old $ 24 name_now $ 24 name_future $ 24; input id name_old $ name_now $ name_future $ inc_old inc_now inc_future; cards; 1 Ramon Martin Martin_Sheen 23000 50000 700000 2 John Johnnie J_boy 10000 20000 600000 3 Mary_Cathleen Bo Bo_Derek 15000 40000 250000 ; run; *double dash means positionally consecutive; proc print data=character noobs; var id name_old--name_future inc_old--inc_future; run; data character_array; set character; length name $ 24; array Aname(3) $ name_old name_now name_future; array Aincome(3) inc_old inc_now inc_future; do time= 1 to 3; name = Aname[time]; income = Aincome[time]; output; end; run; proc format; value t_format 1='old' 2='now' 3='future'; run; proc print data=character_array noobs; format time t_format.; var id time name income; run; ******************************************************************************; *Understanding the functions first. and last. as well as the retain statement.; *In order to understand how to use arrays to reshape from long to wide we ; *will need to understand how the first. and last. functions work as well ; *as understand how the retain statement works.; *Looking at examples of the retain statement.; data missings; input id measurement; cards; 1 . 1 2 3 . 2 3 3 4 2 . 3 . 1 . 3 5 3 6 ; run; data ex_retain; set missings; retain new_meas 0; /*try running it with this line as a comment*/ if measurement ne . then new_meas = measurement; run; proc print data=ex_retain; run; data ex2_retain; set missings; retain new1 0; /*try running it with this line commented out!*/ if measurement ne . then new1 = new1 + measurement; run; proc print data=ex2_retain; run; *Examples of using the first. and last. functions.; *We will be using first.id and last.id so sorting on id is necessary; proc sort data=missings out=sort_miss; by id; run; *The by statement must precede the function call to first. and last.; data ex1; set sort_miss; by id; if first.id then first=1; else first=0; if last.id then last=1; else last=0; run; proc print data=ex1; run; *Combining the first. function with a retain statement to get a running sum and count.; data kids; length kidname $ 4; input famid kidname birth_order wt; cards; 1 Beth 1 60 1 Barb 3 20 4 Sam 1 100 4 Stu 2 90 1 Bob 2 40 3 Pete 1 60 3 Phil 3 20 2 Andy 1 80 3 Pam 2 40 2 Al 2 50 2 Ann 3 20 ; run; *we will be using first.famid and last.famid so sorting on famid is necessary; proc sort data=kids out=sort_kids; by famid; run; data retain1; set sort_kids; retain sumwt count; /*carry over the value from previous observation to next obs*/ by famid; /* requires sorting */ if first.famid then do; /*at first obs of each family set sumwt and count equal to 0*/ sumwt=0; count=0; end; sumwt = sumwt + wt; count = count + 1; meanwt = sumwt/count; *if last.famid then output; /*output only the last obs for each family*/ run; proc print data=retain1 noobs; var famid kidname wt sumwt count meanwt; run; *outputting only the final sumwt, count and meanwt per family; data retain2; set retain1; by famid; if last.famid then output; /*output only the last obs for each family*/ run; proc print data=retain2 noobs; var famid sumwt count meanwt; run; *******************************************************************; *Reshaping long to wide using arrays; *We will use the long_array data set created from the wide data set; *and we will reshape it back to the original wide format; proc print data=long_array; run; *we will be using first.famid so sorting on famid is necessary; proc sort data=long_array out=long_sort; by famid; run; data wide_array1; set long_sort; by famid; retain faminc96-faminc98; array Afaminc(96:98) faminc96-faminc98; if first.famid then do; do i = 96 to 98; Afaminc[i] = .; end; end; Afaminc(year) = faminc; /*looping across values in the variable year*/ *if last.famid then output; /*outputs only the last obs per family*/ drop year faminc i; run; proc print data=wide_array1 noobs; run; data wide_array; set long_sort; by famid; retain faminc96-faminc98; array Afaminc(96:98) faminc96-faminc98; if first.famid then do; do i = 96 to 98; Afaminc[i] = .; end; end; Afaminc(year) = faminc; /*looping across values in the variable year*/ if last.famid then output; /*outputs only the last obs per family*/ drop year faminc i; run; proc print data=wide_array noobs; run; proc print data=wide noobs; run; *************************************************************; *A more subtle use of arrays. An issue in SAS data management is that *we can not access multiple observations at one time. Thus, we can not do *comparisons across observations. *One solution to this problem is to transpose the data from long to wide; *then we can use the array to do the comparisons very easily.; *Goal: to compare each observation with the previous and the next observation; *If they are the same then flag the observation.; data real_life; input person topicA; cards; 1 0 1 1 3 -1 1 0 2 0 1 1 2 -1 2 -1 3 0 3 1 4 0 1 1 4 1 4 0 2 -1 4 0 4 0 1 -1 ; run; proc print data=real_life noobs; run; *First we need to number the observations within each person; *we must sort on person since we will be using first.person; proc sort data=real_life out=sort_real; by person; run; data count_real; set sort_real; retain count; by person; if first.person then count = 0; count = count + 1; run; proc print data=count_real noobs; run; data wide_real; set count_real; array AtopicA(6) topicA_1-topicA_6; retain topicA_1-topicA_6; by person; if first.person then do; do i = 1 to 6; AtopicA[i] = .; /* setting everything to missing */ end; end; AtopicA(count) = topicA; /*looping across values in the variable count*/ if last.person then output; /* outputs only the last obs per person */ run; proc print data=wide_real noobs; var person topicA_1-topicA_6; run; *Now, let's find the people who have the same value for 3 observations in a row; data three; set wide_real; array topic(6) topicA_1-topicA_6; do i = 2 to 5; if topic[i-1] ne . & topic[i] ne . & topic[i+1] ne . & topic[i]=topic[i-1] & topic[i]=topic[i+1] then flagA=1; end; if flagA=. then flagA=0; run; proc print data=three noobs; var person topicA_1-topicA_6 flagA; run;