This sample is very handy for handling data extract
files containing embedded CR and LF characters within text
data. Originally created to process Sybase BCP files.
#!/usr/bin/perl
#===================================
# text2rec : Remove embedded tabs/CR within BCP-out files
# ( 99% successful )
#
# Arguments: infile outfile
#
#===================================
use strict;
use warnings;

# Print a warning once a single record has absorbed more than this many
# continuation lines.
my $MAX_CHUNKS = 300;

# is_record_start($line)
#
# Decide whether a line begins a new record.  $line is expected to have had
# its tabs already converted to pipes.  Returns 1 when the first two
# pipe-delimited fields look "real", 0 otherwise.
#
# This is the one place that must be adapted for a different file layout;
# two example tests are shown below.
sub is_record_start {
    my ($line) = @_;

    # A line with no delimiter at all cannot be the start of a record.
    return 0 unless $line =~ /\|/;

    # Grab the first two fields.  split drops trailing empty fields, so
    # either one may come back undefined (e.g. for a line that is just "|");
    # fall back to a blank, which matches neither test below.
    my ($field1, $field2) = split /\|/, $line;
    $field1 = " " unless defined $field1;
    $field2 = " " unless defined $field2;

    # 2 numeric fields
    # return ( ($field1 =~ /[0-9]/) && ($field2 =~ /[0-9]/) ) ? 1 : 0;

    # code field, numeric field
    return ( ($field1 =~ /[A-Z][A-Z]/) && ($field2 =~ /[0-9]/) ) ? 1 : 0;
}

# Extra arguments are ignored; fewer than two is an error.
@ARGV >= 2 or die "Usage: $0 infile outfile\n";
my ($cmtfile, $outfile) = @ARGV;

# Three-argument open with lexical handles: the file names are never
# interpreted as an open mode or as a command to run.
open(my $in,  '<', $cmtfile) or die "Input file: Cannot open $cmtfile: $!\n\n";
open(my $out, '>', $outfile) or die "Output file: Cannot write $outfile: $!\n\n";

my $record = "";    # record currently being assembled (pipe-delimited)
my $chunks = 0;     # number of continuation lines in the current record

while (my $line = <$in>)
{
    chomp $line;

    # replace tabs with pipes, good for debugging
    $line =~ s/\t/\|/g;

    # replace bizarre characters
    # NOTE(review): these patterns match the literal text "<B0>" / "<ED>".
    # They may originally have been raw high-bit bytes that a viewer
    # displayed this way -- confirm against real input before changing.
    $line =~ s/<B0>/ /g;
    $line =~ s/<ED>/ /g;

    if (is_record_start($line))
    {
        # --> Record start: flush the previous record, then begin a new one.
        # Only records containing at least one lowercase letter are written;
        # this also skips the empty buffer before the very first record.
        if ($record =~ /[a-z]/)
        {
            $record =~ s/\|/\t/g;
            print {$out} $record, "\n";
        }
        $record = $line;
        $chunks = 0;
    }
    else
    {
        # ---> Not a record start, but a record chunk.
        # Attempt to remove tabs embedded within the text field: if no
        # delimiter is followed by a digit, treat every delimiter on this
        # line as part of the text and blank it out.
        if ($line !~ /\|[0-9]/)
        {
            $line =~ s/\|/ /g;
        }
        $record = $record . " " . $line;
        $chunks++;

        # Print a warning for excessively long text fields.
        # If you see this message for all lines in the file, then
        # is_record_start() is never succeeding and needs to be adjusted.
        if ($chunks > $MAX_CHUNKS)
        {
            print "Limit exceeded: $chunks \n";
        }
    }
}

# Last record.  Skip it when nothing was read at all, so an empty input
# file yields an empty output file instead of a single blank line.
if ($record ne "")
{
    $record =~ s/\|/\t/g;
    print {$out} $record, "\n";
}

# Buffered write errors only surface at close, so check it.
close($out) or die "Output file: Cannot close $outfile: $!\n\n";
close($in);
|
|