HTML Table Extractor

This PHP script parses “useful” information from HTML tables. It was developed as an additional tool in the course of my master's thesis at the Graz University of Technology.

Please refer to my master thesis for detailed information on this tool.

/**
 * HTMLTableExtractor
 *
 * Parses HTML tables and extracts "useful" information from them.
 * Valid "keywords" are identified by their markup: bold face.
 * Note that keywords are only taken into consideration until a special delimiter
 * is reached. Any keyword markup after that point is ignored. Writes the parsed
 * output in a (parser-)optimized format to the file specified for main().
 *
 * main():
 * - Reads all files from source folder specified to main().
 * - Writes output to file specified to main().
 * - Writes log messages to log file specified to main().
 *
 * @filesource htmlTableExtractor.html
 * @author Matthias Kerstner info@kerstner.at
 * @version 1.6
 */

require_once 'simple_html_dom.php'; //http://sourceforge.net/projects/simplehtmldom/

// Runtime configuration for the whole script run.
set_time_limit(0); //remove the execution time limit
ini_set('memory_limit', '5000M'); //raise the memory limit for simple_html_dom...

// Multibyte string functions and mb_ereg_* regular expressions operate on UTF-8.
mb_internal_encoding('UTF-8');
mb_regex_encoding('UTF-8');

// Maximum number of <td> cells evaluated per table row in main().
define('LANG_CNT', 2); //including source language

header('Content-type: text/html; charset=utf-8'); //for warnings, errors,...

/**
 * Reads @param{$filename} and returns its content converted to UTF-8.
 *
 * The source encoding is detected (strictly) among UTF-8 and ISO-8859-1.
 * If the file cannot be read or is empty, an empty string is returned.
 * If detection fails, ISO-8859-1 is assumed, because every byte sequence
 * can be interpreted in that encoding.
 *
 * @param <String> $filename path of the file to read
 * @return <String> file content encoded as UTF-8 ('' on read failure)
 */
function file_get_contents_utf8($filename) {
    $content = file_get_contents($filename);

    //file_get_contents() returns false on failure; do not feed that to mb_*
    if($content === false || $content === '')
        return '';

    $encoding = mb_detect_encoding($content, array('UTF-8', 'ISO-8859-1'), true);

    //mb_detect_encoding() returns false if no candidate matches; passing that
    //on to mb_convert_encoding() would raise an error on PHP 8
    if($encoding === false)
        $encoding = 'ISO-8859-1';

    return mb_convert_encoding($content, 'UTF-8', $encoding);
}

/**
 * Recursively collects all files below @param{$baseDir}.
 *
 * Sub-directories are descended into instead of being returned as entries,
 * so the result only contains paths of files. Symbolic links pointing to
 * directories are not followed (prevents endless recursion on link cycles).
 * Directories that cannot be opened are skipped silently. The order of the
 * returned paths is the order delivered by readdir().
 *
 * Terminates the script via die() if @param{$baseDir} is not a directory.
 *
 * @param <String> $baseDir folder to be scanned
 * @return <array> paths ($baseDir."/".name) of all files found in @param{$baseDir}
 */
function gatherSrcFiles($baseDir) {
    if(!is_dir($baseDir))
        die('ERROR: baseDir "'.$baseDir.'" does not exist');

    $list = array();

    //best effort: an unreadable directory yields no entries instead of aborting
    $handle = @opendir($baseDir);
    if($handle === false)
        return $list;

    while(false !== ($file = readdir($handle))) {
        if($file === '.' || $file === '..')
            continue;

        $path = $baseDir."/".$file;

        if(is_dir($path)) {
            if(!is_link($path)) //descend into real sub-directories only
                $list = array_merge($list, gatherSrcFiles($path));
        } elseif(is_file($path)) {
            $list[] = $path;
        }
    }
    closedir($handle);

    return $list;
}

/**
 * Checks @param{$dstFile} line by line for consistent translation entries
 * and appends (simple) statistics plus one warning per invalid line to
 * @param{$logFile}.
 *
 * A line is considered valid if it starts with "<word> -> <word>".
 * NOTE(review): main() writes entries using "=>" rather than "->" - confirm
 * that this pattern is the intended syntax.
 *
 * Terminates the script via die() if one of the files cannot be opened.
 *
 * @param string $dstFile file to be checked
 * @param string $logFile log file the result is appended to
 */
function checkConsistency($dstFile, $logFile) {
    $fp = null;
    $fpLog = null;

    if(false === ($fp = @fopen($dstFile, 'r')))
        die("ERROR: Failed to open dstFile '".$dstFile."', aborting...");
    if(false === ($fpLog = @fopen($logFile, 'a+'))) {
        fclose($fp);
        die("ERROR: Failed to open logFile '".$logFile."', aborting...");
    }

    $errBuffer = '';
    $lineCnt = 0;
    $inconsistentCnt = 0;

    //Loop on the return value of fgets() instead of feof(): feof() only turns
    //true after a failed read, which used to count a phantom (false) line.
    //No length limit, so long lines are not split into several "lines".
    while(false !== ($line = fgets($fp))) {
        $line = rtrim($line, "\r\n"); //keep line breaks out of the log message

        if(!mb_ereg_match("[^\W]+ -> [^\W]+", $line)) { //check syntax
            $errBuffer .= "\nWARNING: invalid line found: '".$line."'";
            $inconsistentCnt += 1;
        }
        $lineCnt += 1;
    }

    fclose($fp);

    $buff = "Consistency-check statistics:\n".$inconsistentCnt." / ".$lineCnt;

    $percentage = 0;
    if($lineCnt > 0) //calculate percentage (whole percent)
        $percentage = round($inconsistentCnt / $lineCnt * 100);

    $buff .= ", ".$percentage."% inconsistent\n".$errBuffer;
    fwrite($fpLog, $buff);
    fclose($fpLog);
}

/**
 * Strips the page header frame(s) from @param{$fileContent}.
 *
 * A page header is a <SPAN ID="FrameN"> element of fixed size that wraps a
 * centered paragraph holding one word, optionally followed by further words
 * separated by a dash. Every occurrence is replaced by an empty string.
 *
 * @param <String> $fileContent HTML content of a source file
 * @return <String> content without page header markup
 */
function removePageHeader($fileContent) {
    $headerParts = array(
        //opening frame element
        '<SPAN ID="Frame\d+" DIR="LTR" ',
        'STYLE="float: left; width: 7.74cm; height: 0.74cm; border: none; padding: 0cm; background: #ffffff">',
        '\s*',
        //centered paragraph with optional font markup
        '<P LANG="de-DE" CLASS="western" ALIGN=CENTER( STYLE="margin-top: 0cm")*>',
        '\s*',
        '(<FONT FACE="Verdana, sans-serif">)*(<FONT SIZE=2>)*',
        '\s*',
        //header text: word, optionally followed by dash-separated words
        '[a-zA-ZäüöÄÜÖß]*\s*(–\s*[a-zA-ZäüöÄÜÖß]*)*',
        //optional closing tags
        '\s*(</FONT>)*\s*(</P>)*\s*',
        '</SPAN>',
    );

    $headerPattern = implode('', $headerParts);
    $withoutHeader = mb_ereg_replace($headerPattern, '', $fileContent);

    return $withoutHeader;
}

/**
 * Runs the HTML table extractor.
 *
 * Reads every file returned by gatherSrcFiles() for @param{$baseDir}, converts
 * it to UTF-8, strips the page header markup and walks all table rows (<tr>).
 * For each of the first LANG_CNT cells (<td>) of a row, the keyword is
 * assembled from the bold (<b>) parts of the cell's first paragraph that occur
 * before the first delimiter (":" or ","). Cells with an even index (source
 * language) are written as "[keyword] => ", cells with an odd index
 * (translation) as "[keyword]<br>{source} => {translation}<br><br>".
 * Afterwards checkConsistency() is run on the written output.
 *
 * Entries that are not regular files and files that cannot be parsed are
 * skipped with a warning in the log file.
 *
 * Terminates the script via die() if LANG_CNT is not defined or if the
 * destination or log file cannot be opened.
 *
 * @param <String> $baseDir base folder containing files to be processed
 * @param <String> $dstFile destination path where output will be written to
 * @param <String> $logFile destination path where log output will be written to
 */
function main($baseDir, $dstFile, $logFile) {

    $fpDst = null;
    $fpLog = null;
    $formatted_src_lang = "";

    if(!defined('LANG_CNT'))
        die("ERROR: LANG_CNT not defined");
    if(false === ($fpDst = @fopen($dstFile, 'w')))
        die("ERROR: Failed to open dstFile '".$dstFile."', aborting...");
    if(false === ($fpLog = @fopen($logFile, 'w')))
        die("ERROR: Failed to open logFile '".$logFile."', aborting...");

    fwrite($fpDst, "<HTML><HEAD><META HTTP-EQUIV=\"CONTENT-TYPE\" ".
            "CONTENT=\"text/html; charset=utf-8\"></HEAD><BODY>");

    $files = gatherSrcFiles($baseDir);
    $delimiters = array(":", ",");   //valid delimiters for delimiting keyword-search

    foreach($files as $f) { //parse files in selection

        if(!is_file($f)) { //e.g. sub-directories cannot be parsed
            fwrite($fpLog, "WARNING: skipping '".$f."' (not a regular file)\n");
            continue;
        }

        //load as UTF-8 (the header pattern and all mb_* calls rely on it) and
        //parse the loaded content itself - it is markup, not a file name
        $fileContent = removePageHeader(file_get_contents_utf8($f));
        $html = str_get_html($fileContent);

        if(!$html) { //str_get_html() yields false for empty or oversized input
            fwrite($fpLog, "WARNING: could not parse '".$f."' (skipping file...)\n");
            continue;
        }

        foreach($html->find('tr') as $tr) { //parse rows

            $langs = $tr->getElementsByTagName('td');
            $langCnt = 0;

            foreach($langs as $td) { //parse translations

                if($langCnt >= (int)LANG_CNT) {
                    fwrite($fpLog, "WARNING: invalid syntax detected ".
                            "(trying to continue...)\n");
                    break; //invalid td-count, break loop to retain data consistency
                }

                $paragraphs = $td->getElementsByTagName('p');
                if(count($paragraphs) < 1) {
                    fwrite($fpLog, "WARNING: invalid p count detected ".
                            "(trying to continue...)\n");
                    break; //invalid p-count, break loop to retain data consistency
                }

                $p = $paragraphs[0]; //max 1 per td allowed (all others will be ignored)

                $plainUntilDelim = mb_ereg_replace("[\t]", "",
                        html_entity_decode(trim($p->plaintext), ENT_QUOTES, 'UTF-8'));

                $firstFound = mb_strlen($plainUntilDelim);
                foreach($delimiters as $d) { //find first delimiter position
                    $pos = mb_strpos($plainUntilDelim, $d);
                    if($pos !== false && $pos < $firstFound)
                        $firstFound = $pos;
                }

                if($firstFound >= 1) {
                    //parse plaintext until first delimiter found (or to the end if none found)
                    $plainUntilDelim = mb_substr($plainUntilDelim, 0, $firstFound);
                }

                $keywordParts = $p->getElementsByTagName('b');
                $keyword = '';
                $searchOffset = 0;
                $parsedKeywordParts = array();

                foreach($keywordParts as $v) {
                    //parse keyword(s) (all bold characters until first delimiter is found)
                    //remove special characters except delimiters (!)
                    $stripped = mb_ereg_replace("[\t]", "",
                            html_entity_decode(trim($v->plaintext), ENT_QUOTES, 'UTF-8'));

                    $firstFoundStripped = mb_strlen($stripped);
                    foreach($delimiters as $d) { //find first delimiter position
                        $pos = mb_strpos($stripped, $d);
                        if($pos !== false && $pos < $firstFoundStripped)
                            $firstFoundStripped = $pos;
                    }
                    //NOTE(review): this tests $firstFound (paragraph level), not
                    //$firstFoundStripped - possibly a copy/paste slip; kept unchanged
                    //because a change would alter the extracted keywords
                    if($firstFound >= 1) //extract plaintext until first delimiter
                        $stripped = mb_substr($stripped, 0, $firstFoundStripped);

                    $stripped = mb_ereg_replace("[,:]", "", trim($stripped)); //do not remove spaces here

                    if(!empty($stripped) && !in_array($stripped, $parsedKeywordParts, true)) {
                        //ignore empty tags and duplicates (strict comparison, so
                        //numeric-looking parts like "1" and "01" stay distinct)

                        $pos = mb_strpos($plainUntilDelim, $stripped, $searchOffset);

                        if($pos === false)
                            break;  //not found inside delimited plaintext -> break loop

                        if($pos > $searchOffset+1 && ($langCnt % 2 != 0))
                            $keyword .= " ".$stripped; //translation-td, insert space between keyword-parts
                        else
                            $keyword .= $stripped;

                        $parsedKeywordParts[] = $stripped;
                        $searchOffset = $pos; //set next iteration offset
                    }
                }

                $keyword = mb_ereg_replace("[\t\r\n]", " ", trim($keyword)); //strip any unnecessary chars

                if(empty($keyword))
                    fwrite($fpLog, "ERROR: could not determine keyword\n");

                if($langCnt % 2 != 0) { //used for pretty formatting in dstFile
                    fwrite($fpDst, "[".$keyword."]<br>{".$formatted_src_lang."} => {".$p->innertext."}<br><br>");
                } else { //src language
                    fwrite($fpDst, "[".$keyword."] => ");
                    $formatted_src_lang = $p->innertext;
                }

                $langCnt += 1;
            } //end parse translations
        } //end parse rows

        fwrite($fpDst, "<br><br><br>"); //start next file
        $html->clear(); //prevent memory-leak
        unset($html);
    } //end parse files

    fwrite($fpDst, "</BODY></HTML>");

    fclose($fpDst);
    fclose($fpLog);

    checkConsistency($dstFile, $logFile);

    echo "DONE!";
}

// Start processing: the arguments are the source folder, the output file and the log file
// (presumably placeholder paths - adjust before running).
main('/source/path', '/dst/path/to/output.html', '/dst/path/to/out.log');

You may also like...

Leave a Reply

Your email address will not be published. Required fields are marked *