#!/usr/bin/perl -w # This is a crude hack to parse the HTML DTDs in order to generate # a HTML parser in perl. The output is a perl structure that describe # the same releationships as the DTD. This script in known to work # on the HTML 3.2 of Tuesday 23-Apr-96, but it might need some # tweaks if that DTD use more sophisticated SGML features. # # Author: Gisle Aas # # $Id: dtd2pm,v 1.4 1998/03/23 09:46:37 aas Exp $ # # Disclaimer: I am not an SGML expert and don't really understand how # to read those damn DTDs. $VERBOSE = 0; undef($/); $DTD = shift || "HTML32.dtd"; open(DTD, $DTD) or die "Can't open $DTD: $!"; $_ = ; close(DTD); $| = 1; ($intro) = //s; # # added this because HTML4.0 as downloaded # doesn't seem to have an HTML.Version ENTITY # so we got an unitialised variable warning # on %entity{%html.version} later # -- pgm ($dtdversion) = /\-\/\/W3C\/\/DTD\s+([^\/]*)\/\/EN/; #print $_; while (s/^\s*//s) { # ignore constructs #print "Skip: \n"; next; } s/^([^>]*)>//; $c = $1; while (1) { # played with different things, # but the KISS principle suggests # we simply eliminate lines for # attributes that are reserved for # possible future use # worry about parsing them when they happen ! -pgm $c =~ s/\s*[^\-]+\-\-\s*reserved\s+for\s+possible\s+future\s+use\s*\-\-//gs; $c =~ s/--.*?--//gs; # remove comments if ($c =~ /--/) { # we really did read too little s/([^>]*)>//; $c .= $1 } else { last; } } $c =~ s/^\s+//; $c =~ s/\s+$//; next unless length $c; # only comments #$c =~ s/\s+/ /g; print "C: $c\n" if $VERBOSE; while (s/^\s*([^\s<]+)//) { print "T: $1\n" if $VERBOSE; } if ($c =~ /^ENTITY\s+(%\s*)?(\S+)\s+(.*)/is) { my($percent, $key, $val) = ($1, lc($2), $3); if ($percent) { $key = "%$key"; } else { $key = "&$key"; $val =~ s/CDATA\s+//; } $val =~ s/^"//s; $val =~ s/"$//s; $val =~ s/(%[\w\.\-]+);?/$entity{lc $1} || $1/eg; $entity{$key} = $val; #print "E: $key => $val\n"; } else { # Expand entities $c =~ s/(%[\w\.\-]+);?/$entity{lc $1} || $1/eg; #print "C: $c\n" if ($c =~ /^ELEMENT\s+\(([^\)]+)\)\s+([-O])\s+([-O])\s+(.*)/is) { my($elems, $start, $stop, $content) = (lc $1, $2, $3, lc $4); for ($elems, $content) { s/\s+//g; } $content =~ s/(\#pcdata)\b/\U$1/g; for $elem (split(/\|/, $elems)) { $element{$elem} = [$start, $stop, $content]; } } elsif ($c =~ /^ELEMENT\s+(\S+)\s+([-O])\s+([-O])\s+(.*)/is) { my($elem, $start, $stop, $content) = (lc $1, $2, $3, lc $4); $content =~ s/\s+//g; $content =~ s/(\#pcdata)\b/\U$1/g; $element{$elem} = [$start, $stop, $content]; } elsif ($c =~ s/^ATTLIST\s+\(([^\)]+)\)\s+//) { my $elems = lc $1; $elems =~ s/\s+//g; my $attrs = parse_attrs($c); for $elem (split(/\|/, $elems)) { $attr{$elem} = $attrs; } } elsif ($c =~ s/^ATTLIST\s+(\S+)\s+//) { $attr{lc $1} = parse_attrs($c); } else { print STDERR "?: $c\n"; } } } # is there anything left? s/^\s+//; print STDERR "?: ", substr($_, 0, 200), "\n" if length $_; # At this point, we have initialized the %element, %attr arrays. # Their content is as described here: # # %element = ( tag => [ $start, $end, $content ], # ... # ); # # %attr = ( tag => { # attr => [ $values, $default ], # ... # }, # ... # ); # # The %entity hash is also available, but should not be of much use # now. # Dump result to stdout so that it is useful to a perl program. print "##### Do not edit!! Auto-generated from $DTD\n\n"; # was : print "package HTML::DTD; # \n\n"; # # changed to: print "package HTML::DTD; # \n\n"; # --pgm $intro =~ s/^[ \t]*/\# /gm; print "$intro\n\n"; # initialise and avoid sole-use warnings -pgm print "\@all_tags=()\;\n\@optional_start_tag =()\;\n\@empty=()\;\n"; print "\@optional_end_tag=()\;\n\@boolean_attr=()\;\n\%elem=()\;\n\n"; my @all_tags = sort keys %element; my @empty = (); my @optional_end_tag = (); my @optional_start_tag = (); for (@all_tags) { push(@empty, $_) if $element{$_}[2] eq 'empty'; push(@optional_end_tag, $_) if $element{$_}[2] ne 'empty' and $element{$_}[1] ne '-'; push(@optional_start_tag, $_) if $element{$_}[0] ne '-'; } print "\@all_tags = qw(@all_tags);\n"; print "\@empty = qw(@empty);\n"; print "\@optional_end_tag = qw(@optional_end_tag);\n"; print "\@optional_start_tag = qw(@optional_start_tag);\n"; print <<'EOT'; # The %elem hash is indexed by lowercase tag identifiers. Each element is # an anonymouse hash with the following values: # # 'content': Describes the content that can be present within this # element. This value is missing if the element should # always be empty. # # 'optend': True if the end tag for this element is optional # # 'attr': A hash that describes the attributes of this element. # Each element in this hash is a anonymous array with # two values: allowed values; default value # EOT print "\n%elem = (\n"; @boolean_attr = (); for (@all_tags) { my $e = $_; # # there are other keywords popping up. # play it more conservative and use quotes around (just about) everything. # WAS $e = "'$e'" if $e eq 'tr' || $e eq 'link' || $e eq 'sub'; # these are perl keywords # inserted single quotes next line # but that means we do away with nicer formatting --pgm # not printf "\'%-4s\' => {\n", $e; but print "\'$e\' => {\n"; print "\t content => '$element{$_}[2]',\n" if $element{$_}[2] ne 'empty'; print "\t optend => 1,\n" if $element{$_}[1] ne '-'; if (exists $attr{$_}) { print "\t attr => {\n"; for $a (sort keys %{$attr{$_}}) { my @a = @{$attr{$_}{$a}}; # added surrounding quotes to avoid syntax err, warnings on # hyphenated words and keywords # print "\t\t\t\'$a\' => [", join(",", map {qq("$_")} @a), "],\n" unless $a eq "%reserved"; # quotes here, too (e.g., select keyword overlap) push(@boolean_attr, "\'$_\'\t=> '$a'") if $a eq $attr{$_}{$a}[0]; } print "\t\t },\n"; } print "\t},\n"; } print ");\n"; print "\n\n\%boolean_attr = (\n"; for (@boolean_attr) { print " $_,\n"; } print ");\n"; print "\n1;\n"; exit; #----------------------------------------------------------------------- sub parse_attrs # Parse the content { my $a = shift; my %a = (); my $a2 = $a; #print "---$a---\n"; while ($a =~ /\S/) { $a =~ s/^\s*(\S+)\s*//; my $key = $1; my ($val, $default); if ($a =~ s/^\(([^\)]+)\)//) { $val = $1; $val =~ s/\s+//g; } elsif ($a =~ s/^(\S+)//) { $val = $1; } # problem was here with %reserved # and comment lines for reserved attrs... pgm # but not an issue if we drop them # but a bit more diagnostic info ? else { die "Missing value at end of:\n[$a2]\n-------\n"; } $val = lc($val) unless $val =~ /^[A-Z]+$/; $val =~ s/^"(.*)"$/$1/; $a =~ s/^\s+//; if ($a =~ s/^(\#FIXED\s+\'[^\']+\')//) { $default = $1; } elsif ($a =~ s/^(\S+)//) { $default = $1; } # same problem as for $val above on reserveds else { die "Missing default at end of:\n[$a2]\n------\n"; } $default = lc($default) unless $default =~ /^[\#\"]/; $default =~ s/^"(.*)"$/$1/; $a{$key} = [$val, $default]; } \%a; }