# Last update: 2003/12/07
#
# HTML Table to CSV
#
# (p) Jerry Nagasaki
#
#
# Converts "simple" n*m HTML tables (w/o merged cells or nested
# tables) to a csv file.
#
# INPUT:  Extracted HTML table (<table> ... </table>) named "table.html"
# OUTPUT: Generated CSV file (";"-separated) named "table.csv"
#
# Notes:
# - Line breaks inside <td> texts are deleted; they are NOT converted
#   to a "simple space".
# - Only the &nbsp; entity is translated (to a plain space); all other
#   entities are copied to the output unchanged.
# - Fields containing the separator or a double quote are wrapped in
#   double quotes, with embedded quotes doubled.
# - Earlier versions mentioned an optional "plain table" output
#   (table2.html); it was never populated and is not generated here.
#

use strict;
use warnings;

use constant {
    INPUT_FILE  => './table.html',
    OUTPUT_FILE => './table.csv',
    SEPARATOR   => ';',
};

# Reduce the raw content of one table cell to plain text: closing link
# and <var> tags become a space so adjacent words stay separated, every
# other tag is removed, runs of spaces are squeezed and both ends trimmed.
sub _clean_cell {
    my ($cell) = @_;

    $cell =~ s{</(?:a|var)\s*>}{ }gi;
    $cell =~ s{<[^>]*>}{}g;
    $cell =~ tr/ //s;
    $cell =~ s/\A //;
    $cell =~ s/ \z//;

    return $cell;
}

# Quote a field only when it would otherwise break the column layout,
# i.e. when it contains the separator, a double quote or a line break.
sub _quote_field {
    my ($field) = @_;

    my $needs_quoting = index($field, SEPARATOR) >= 0
                     || $field =~ /["\r\n]/;
    return $field unless $needs_quoting;

    ( my $escaped = $field ) =~ s/"/""/g;
    return qq{"$escaped"};
}

# Convert the text of one simple HTML table to CSV.
#
# Parameters:
#   $html - the table markup as a single string (may be undef or empty).
#
# Returns:
#   A string with one "\n"-terminated line per <tr>, cells joined by
#   SEPARATOR.  <th> and <td> cells are treated alike.  Returns the empty
#   string when no row is found.
sub html_table_to_csv {
    my ($html) = @_;
    return '' unless defined $html;

    my $text = $html;

    # Comments may contain '>' and span lines, so they go first.
    $text =~ s/<!--.*?-->//gs;

    # A tag wrapped over several lines must keep its name separated from
    # its attributes; otherwise deleting the line break below would fuse
    # them (e.g. "<td\nclass=x>" -> "<tdclass=x>").
    $text =~ s{(<[^>]+>)}{ my $tag = $1; $tag =~ tr/\r\n\t/ /; $tag }ge;

    $text =~ s/[\r\n\t]//g;    # documented: line breaks are dropped, not spaced
    $text =~ s/&nbsp;/ /g;
    $text =~ tr/ //s;

    # \b keeps <tr>/<td>/<th> from matching longer names such as <thead>.
    # Rows and cells also end at the next opening tag or at end of input,
    # so markup that omits the optional closing tags is still converted.
    my $row_re = qr{
        <tr\b[^>]*>
        (.*?)
        (?= </tr\s*> | <tr\b | </table\s*> | \z )
    }xis;
    my $cell_re = qr{
        <t[dh]\b[^>]*>
        (.*?)
        (?= </t[dh]\s*> | <t[dh]\b | \z )
    }xis;

    my @lines;
    while ( $text =~ /$row_re/g ) {
        my $row = $1;
        my @cells;
        while ( $row =~ /$cell_re/g ) {
            push @cells, _quote_field( _clean_cell($1) );
        }
        push @lines, join( SEPARATOR, @cells ) . "\n";
    }

    return join '', @lines;
}

# Read the table from $in_path, convert it and write the CSV to $out_path.
# Dies with a descriptive message on any I/O failure.
sub convert_file {
    my ( $in_path, $out_path ) = @_;

    open my $in, '<', $in_path
        or die "Cannot read '$in_path': $!\n";
    my $html = do { local $/; <$in> };
    close $in
        or die "Cannot close '$in_path': $!\n";

    my $csv = html_table_to_csv($html);

    open my $out, '>', $out_path
        or die "Cannot write '$out_path': $!\n";
    print {$out} $csv
        or die "Cannot write to '$out_path': $!\n";
    close $out
        or die "Cannot close '$out_path': $!\n";

    return;
}

# Run the conversion only when executed as a script, so the subs above can
# be loaded from another file without touching the disk.
convert_file( INPUT_FILE, OUTPUT_FILE ) unless caller();

1;