attached below is a perl script that "fixes" codebooks. specifically: Default operation is: - read file with newlines - replace octal 342 and 343 with brackets - remove leading formatting characters (+-01) - replace bad ASCII chars (high and control cahrs) with spaces - strips trailing blanks off lines Each of the above default operations can be overidden (see flags below). In additon the -w flag tells the program to strip leading blanks (number specified after -w). -l Logical Record Length of input (default is newline) -b do not replace 342, 343 with [,] -n do not remove leading formatting chars -r do not replace bad ascii with blanks -s do not strip trailing blanks -w strip leading blanks -v Verbose mode -h Help (prints this message) -- Treat remaining arguments as filenames - Use stdin as input #!/usr/local/bin/perl # $Source: /u3/ssdb/asinger/bin/RCS/chunks,v $ # $Revision: 1.6 $ # $Date: 91/09/23 17:30:39 $ # $State: Exp $ # $Author: asinger $ #------------------------------------------------------------------ # Abe Singer (abe@ucsd.edu) modified by Jim Jacobs (jajacobs@ucsd.edu) # Central University Library, Mail Code 0175-R # Social Sciences Database Project # University of California, San Diego # San Diego, California 92122 # (619) 534-5758 #------------------------------------------------------------------ # This perl(1L) script fixes codebooks by adding newlines, removing # trailing blanks, changing the first character to a space, and # removing any stray ^Z's. # # Options: # # -l Logical Record Length of infile (default is newline) # -b do not replace 342, 343 with [,] # -n do not remove leading formatting chars # -r do not replace bad ascii with blanks # -s do not strip trailing blanks # -w strip leading blanks # -d Debug mode # -t test mode (bails before processing) # -v verbose mode. Prints out settings and stuff # -h help. Prints usage message and exits # # # #------------------------------------------------------------------ # # Globals... chop($progName = `basename $0`); $chunk = 32000 ; $brackets = 1 ; $replace_bad = 1 ; $strip = 1; $noForm = 1; $leading = 0 ; $test = 0; $verbose = 0; $debug = 0; $lrl = 0 ; #******************************************************************************* # Name : Usage # Purpose : Prints out usage message and bails # Arguments : 1 - String containing useful error message. # Return Value : None # Calls : Exit # Globals Accessed : # Notes : #******************************************************************************* sub Usage { local ($msg) = @_ ; $, = " "; $\ = "\n"; print STDERR "$progName : ", @_; print STDERR <] [-w ] [files] This perl(1L) script fixes codebooks. Default operation is: - read file with newlines - replace octal 342 and 343 with brackets - remove leading formatting characters (+-01) - replace bad ASCII chars (high and control cahrs) with spaces - strips trailing blanks off lines Each of the above default operations can be overidden (see flags below). In additon the -w flag tells the program to strip leading blanks (number specified after -w). -l Logical Record Length of input (default is newline) -b do not replace 342, 343 with [,] -n do not remove leading formatting chars -r do not replace bad ascii with blanks -s do not strip trailing blanks -w strip leading blanks -v Verbose mode -h Help (prints this message) -- Treat remaining arguments as filenames - Use stdin as input EndOfText exit(-1); } #******************************************************************************* # Name : ParseArgs # Purpose : Reads command line arguments, looking for flags. # Arguments : None. # Return Value : None # Calls : &Usage # Globals Accessed : @ARGV # Notes : "--" means to treat remaining arguments as file names # "-" means use stdin as an input file # stdin is used as input if no files specified #******************************************************************************* sub ParseArgs { local ($_); local (@files); local ($flag, $rest); LOOP: while ($_ = shift(@ARGV)) { # "--" means interpret remaining arguments as filenames if (/^--$/) { @files = (@files, @ARGV); last LOOP; } # A "-" or anything not beginning with "-" is a filename if (!/^-./) { @files = (@files, $_); next LOOP; } # if we got this far, it must be a flag s/^-//; FLAGS: while ($_) { # Parse individual characters in argument m/^(.)(.*)$/; $flag = $1; $rest = $2; $_ = $flag; if (/d/) { $debug++; } elsif (/l/) { $lrl = shift(@ARGV); } elsif (/w/) { $leading = shift(@ARGV); } elsif (/b/) { $brackets=0; } elsif (/r/) { $replace_bad=0; } elsif (/s/) { $strip=0; } elsif (/n/) { $noForm=0; } elsif (/v/) { $verbose++; } elsif (/t/) { $test++; } elsif (/h/) { &Usage("Usage"); } else { &Usage("Unknown argument: $_"); } $_ = $rest; } } # If no files specified, make STDIN the input file. @files = ("-") if ($#files < 0); @ARGV = @files; } #******************************************************************************* # Name : Verbose # Purpose : Dumps out settings # Arguments : None. # Return Value : None # Calls : None # Globals Accessed : All of em #******************************************************************************* sub Verbose { local($ors, $ofs) = ($\, $,); $, = " "; $\ = "\n"; print "$progName: Settings are..."; print "lrl is $lrl"; print "strip is", $strip ? "on" : "off"; print "brackets is", $brackets ? "on" : "off"; print "test is", $test ? "on" : "off"; print "leading is", $leading ? "on" : "off"; print "noForm is", $noForm ? "on" : "off"; print "replace_bad is", $replace_bad ? "on" : "off"; print "debug is", $debug ? "on" : "off"; print "files are:", @ARGV; $, = ofs; $\ = ors; } #******************************************************************************* # # Main() starts here... # #******************************************************************************* &ParseArgs; &Verbose if ($verbose); # Print out stats if requested. exit(0) if ($test); # Go bye bye if in test-mode # Now we start reading input files... # first if lrl is set use "read" if ($lrl) { # Set output end-of-line marker to newline, so's that records will # automagically be new-line delimited $\ = "\n"; $, = ""; READFILES: foreach $infile (@ARGV) { $recnum = 0; print STDERR "$progName: Reading file $infile" if $verbose; open(INFILE, $infile) || die "Can't open input file!\n"; while ($bytes = read(INFILE, $buf, $lrl)) { $recnum++; # Check for read errors... die "$progName: System read error $!, file $infile, " . "record $recnum\n" if (!defined $bytes); # Check to make sure enough bytes are read if ($bytes != $lrl) { $l = length($buf); warn "$progName: Incomplete record, file $infile, " . "record $recnum, length $l.\n" ; } $_ = $buf; &fix ; } close(INFILE); } } # end of if $lrl ###################################################################### # else if $lrl is not set, assume file has newlines. ###################################################################### else { #$\ = "\n"; foreach $infile (@ARGV) { open (INFILE, $infile) || die "can not open $infile" ; while () { &fix ; } close(INFILE); } } # We're outta here! exit(0); ###################################################################### # here is where the work is done. ###################################################################### sub fix { # replace 342 and 343 octal with [ and ] tr/\342/[/ if $brackets ; tr/\343/]/ if $brackets ; # replace bad chars with blanks if desired ($_ =~ s/[\000-\011]|[\013-\037]|[\177-\377]/ /g) if ($replace_bad); # Strip trailing whitespace if desired ($_ =~ s/ *$//) if ($strip) ; # Strip out leading formatting characters replace with blanks s/^[01+-]/ / if ($noForm); # remove leading blanks if desired s/^ {0,$leading}// if ($leading); # Print out the line print $_; } sub first_check { open(INFILE, $infile) || die "Can't open input file!\n"; while (read (INFILE, $tmp, $chunk) ) { if (!( $tmp =~ /\n/ )) { $length = length($tmp); print STDERR " ERROR: No Newlines! $progName exiting. $length bytes, had no newline delimiter, and no lrl set. Use the -l flag to set the logical record length of files that have no newline characters. "; exit (1); } # end of if else { close (INFILE) ; } } # end of while } # end of sub first_check