;;; getphrases.xom
;;; Extract phrases from an XML or HTML file.
;;; This script is the first pass of a phrase indexer.
;;; Copyright (C) 1999 Academia Sinica, permission to use granted under GPL
;;; Created: Rick Jelliffe, ricko@gate.sinica.edu.tw, 1999-04-06
;;;
;;; Overview (derived from the rules below):
;;;   * Each index entry is written as a key line followed by a second
;;;     line reading "  in <title>".
;;;   * The phrase rule matches runs of two to eight "wordthings" and
;;;     writes two-word and three-word entries built from them.
;;;   * Markup-like input and any other character are consumed with no output.
;;;
;;; NOTE(review): several FIND patterns below have unbalanced brackets or
;;; parentheses. It looks as if text between a "<" and the following ">"
;;; was stripped from this copy of the file. They are reproduced exactly
;;; as found; restore them from a pristine copy before running the script.

;;; Declarations and initializations

CROSS-TRANSLATE

;; nearestID : "#" plus the most recent id attribute value seen in a tag.
;;             NOTE(review): it is written below but never read in this file.
;; title     : the text appended to every index entry after "in".
;; title1    : text captured by the first match of the title rule.
;; title2    : text captured by any later match of the title rule.
;; infile    : used as the fallback title; it is never assigned in this
;;             file, so presumably it is supplied by the caller -- TODO confirm.
GLOBAL STREAM nearestID
GLOBAL STREAM title
GLOBAL STREAM title1
GLOBAL STREAM title2
GLOBAL STREAM infile

;; Start every run with empty buffers. title2 is not cleared here; it is
;; (re)opened as a buffer before each use in the title rule.
FIND-START
   SET BUFFER nearestID TO ""
   SET BUFFER title1 TO ""
   SET BUFFER title TO ""

;;; The definition of a wordthing is arbitrary.
;;; Make the numbers bigger to catch fewer phrases.
;;; Words that look like "that" or "this" are not caught
;;;
;;; A wordthing starts at a word boundary with one of:
;;;   - five letters or digits,
;;;   - two upper-case letters,
;;;   - an upper-case letter, "h", one of e/o/i, then three letters,
;;;   - an upper-case letter NOT followed by "h" + e/o/i, then two letters,
;;; and continues with any number of letters or digits, each optionally
;;; preceded by one of ".", "_" or "@".
;;;
;;; STARTWORDTHING (first word of a phrase) and WORDTHING (later words)
;;; currently differ only in parenthesisation; presumably they are kept
;;; separate so that they can be tuned independently.

MACRO STARTWORDTHING IS
   ( WORD-START
     ( ( [LETTER OR DIGIT]{5})
     | (UC UC)
     | (UC "h" ["eoi"] LETTER{3} )
     | (UC (LOOKAHEAD NOT ( "h" ["eoi"])) LETTER{2} ) )
     (["._@"]? [LETTER OR DIGIT])* )
MACRO-END

MACRO WORDTHING IS
   ( WORD-START
     ( [LETTER OR DIGIT]{5}
     | (UC UC)
     | (UC "h" ["eoi"] LETTER{3} )
     | (UC (LOOKAHEAD NOT ( "h" ["eoi"])) LETTER{2} ) )
     ( ["._@"]?
     [LETTER OR DIGIT])* )
MACRO-END

;;; Separator between the words of a phrase: one or more of white space,
;;; a hyphen, or a possessive "'s".

MACRO PHRASESPACE IS
   ( WHITE-SPACE OR "-" OR "'s" )+
MACRO-END

;;; Find some particular information

;; Tag rule: scan the captured tag text for an id attribute and remember
;; it in nearestID; also default the title to the input name if no title
;; has been set yet.
;; NOTE(review): the pattern on the next line is damaged (unbalanced "("
;; and "]"); the text that opened the tag match appears to be missing.
FIND (""]+=theTag ">"
   SET BUFFER nearestID TO ""
   REPEAT SCAN PATTERN theTag
      ;; id= in any letter case, either quote style; value runs to the next quote
      MATCH UL "id=" ("'" | '"' ) [ANY EXCEPT "'%""]+=theID
         SET BUFFER nearestID to "#%x(theID)"
      MATCH ANY ; swallow
   AGAIN
   ;; for FAQs with no title
   DO WHEN LENGTH OF "%g(title)" IS EQUAL 0
      SET BUFFER title TO "%g(infile)"
   DONE

;; Title rule: the first match fills title1 and becomes the title; every
;; later match fills title2 and the title becomes "title1::title2".
;; In both branches leading and trailing white space is dropped and each
;; internal run of white space is replaced by a single space.
;; NOTE(review): the start of this pattern is damaged (stray "]"), so
;; which element(s) it matched cannot be told from this copy.
FIND ""]* ">" [ANY EXCEPT "<"]+=theTitle
   DO WHEN LENGTH OF "%g(title1)" IS EQUAL 0
      OPEN title1 AS BUFFER
      REPEAT SCAN PATTERN theTitle
         MATCH VALUE-START WHITE-SPACE+ ; strip
         MATCH WHITE-SPACE+ VALUE-END ; strip
         MATCH WHITE-SPACE+
            PUT title1 " "
         MATCH ANY=theChar
            PUT title1 "%x(theChar)"
      AGAIN
      CLOSE title1
      SET BUFFER title TO "%g(title1)"
   ELSE
      OPEN title2 AS BUFFER
      REPEAT SCAN PATTERN theTitle
         MATCH VALUE-START WHITE-SPACE+ ; strip
         MATCH WHITE-SPACE+ VALUE-END ; strip
         MATCH WHITE-SPACE+
            PUT title2 " "
         MATCH ANY=theChar
            PUT title2 "%x(theChar)"
      AGAIN
      CLOSE title2
      SET BUFFER title TO "%g(title1)::%g(title2)"
   DONE

;; Name rule: writes the captured text as an entry of its own, formatted
;; with the "52fu" modifiers (presumably a padded, upper-cased key --
;; confirm against the format-item documentation) followed by the text as is.
;; The line break inside the string literal is part of the output.
;; NOTE(review): the start of this pattern is damaged (stray "]").
FIND ""]* ">" [ANY EXCEPT "<"]+=theName
   OUTPUT "%52fux(theName)%x(theName)
  in %g(title)%n"

;; Consumes an element's start tag and following text without output.
;; NOTE(review): the start of this pattern is damaged (stray "]").
FIND ""]* ">" [ANY EXCEPT "<"]+ ; strip

;;; Skip over tags
;;; None of the rules in this group has an action, so whatever they match
;;; is consumed and produces no output.

;; NOTE(review): damaged pattern; the closing "-->" suggests a comment skipper.
FIND "") ANY)+ "-->"

;; NOTE(review): damaged pattern (stray "]").
FIND ""]+ ">" ;; no internal subset

;; NOTE(review): damaged pattern; as written it is an empty string literal.
FIND "" ;; internal subset

;; NOTE(review): damaged pattern; the closing "?>" suggests a
;; processing-instruction skipper.
FIND "") ANY)+ "?>"

;; Any remaining tag: "<", one or more characters other than ">", then ">".
FIND "<" [ANY EXCEPT ">"]+ ">"

;;; Phrase rule.
;;; Matches two to eight wordthings separated by PHRASESPACE. thing1..thing8
;;; capture the words and sp1..sp7 the separators between them; words three
;;; to eight are nested optional groups, so thingN being specified implies
;;; that every earlier thing is specified too.
;;; NOTE(review): thePhrase is captured but never referenced below.

FIND ( STARTWORDTHING=thing1 PHRASESPACE=sp1 WORDTHING=thing2
       ( PHRASESPACE=sp2 WORDTHING=thing3
         ( PHRASESPACE=sp3 WORDTHING=thing4
           ( PHRASESPACE=sp4 WORDTHING=thing5
             ( PHRASESPACE=sp5 WORDTHING=thing6
               ( PHRASESPACE=sp6 WORDTHING=thing7
                 ( PHRASESPACE=sp7 WORDTHING=thing8 )? )? )? )? )? )? )=thePhrase

   ;; bufN holds separator spN with each run of white space replaced by a
   ;; single space, so an entry never contains a line break from the input.
   LOCAL STREAM buf1
   LOCAL STREAM buf2
   LOCAL STREAM buf3
   LOCAL STREAM buf4
   LOCAL STREAM buf5
   LOCAL STREAM buf6
   LOCAL STREAM buf7

   ;; get rid of newlines
   ;; sp1 always exists, so its scan is unconditional. The scans of
   ;; sp2..sp7 carry a "WHEN PATTERN spN IS SPECIFIED" guard after AGAIN.
   ;; NOTE(review): presumably the guard applies to the whole REPEAT SCAN
   ;; so that an unspecified pattern is never scanned -- confirm that this
   ;; placement is accepted by the OmniMark version in use.
   OPEN buf1 AS BUFFER
   REPEAT SCAN PATTERN sp1
      MATCH WHITE-SPACE+
         PUT buf1 " "
      MATCH ANY=theChar
         PUT buf1 "%x(theChar)"
   AGAIN
   CLOSE buf1

   OPEN buf2 AS BUFFER
   REPEAT SCAN PATTERN sp2
      MATCH WHITE-SPACE+
         PUT buf2 " "
      MATCH ANY=theChar
         PUT buf2 "%x(theChar)"
   AGAIN WHEN PATTERN sp2 IS SPECIFIED
   CLOSE buf2

   OPEN buf3 AS BUFFER
   REPEAT SCAN PATTERN sp3
      MATCH WHITE-SPACE+
         PUT buf3 " "
      MATCH ANY=theChar
         PUT buf3 "%x(theChar)"
   AGAIN WHEN PATTERN sp3 IS SPECIFIED
   CLOSE buf3

   OPEN buf4 AS BUFFER
   REPEAT SCAN PATTERN sp4
      MATCH WHITE-SPACE+
         PUT buf4 " "
      MATCH ANY=theChar
         PUT buf4 "%x(theChar)"
   AGAIN WHEN PATTERN sp4 IS SPECIFIED
   CLOSE buf4

   OPEN buf5 AS BUFFER
   REPEAT SCAN PATTERN sp5
      MATCH WHITE-SPACE+
         PUT buf5 " "
      MATCH ANY=theChar
         PUT buf5 "%x(theChar)"
   AGAIN WHEN PATTERN sp5 IS SPECIFIED
   CLOSE buf5

   OPEN buf6 AS BUFFER
   REPEAT SCAN PATTERN sp6
      MATCH WHITE-SPACE+
         PUT buf6 " "
      MATCH ANY=theChar
         PUT buf6 "%x(theChar)"
   AGAIN WHEN PATTERN sp6 IS SPECIFIED
   CLOSE buf6

   OPEN buf7 AS BUFFER
   REPEAT SCAN PATTERN sp7
      MATCH WHITE-SPACE+
         PUT buf7 " "
      MATCH ANY=theChar
         PUT buf7 "%x(theChar)"
   AGAIN WHEN PATTERN sp7 IS SPECIFIED
   CLOSE buf7

   ;; output two-word phrases, unless it is the start of a three word phrase
   ;;
   ;; Every entry below is: two key fields formatted with "20fu", a filler
   ;; of ten underscores where a three-word entry would have its third key,
   ;; the readable phrase, a line break, and "  in <title>".
   ;; The line break inside each string literal is part of the output.
   ;;
   ;; Exactly two words: one inverted entry ("thing2, thing1") and one in
   ;; reading order.
   OUTPUT "%20fux(thing2) %20fux(thing1) __________"_ "%x(thing2), %x(thing1)%g(buf1)
  in %g(title)%n" UNLESS PATTERN thing3 IS SPECIFIED
   OUTPUT "%20fux(thing1) %20fux(thing2) __________"_ "%x(thing1)%g(buf1)%x(thing2)
  in %g(title)%n" UNLESS PATTERN thing3 IS SPECIFIED

   ;; Three or more words: the guards are mutually exclusive, so only the
   ;; LAST pair of the phrase gets a two-word entry, followed after a comma
   ;; by one or two earlier words as context.
   OUTPUT "%20fux(thing2) %20fux(thing3) __________"_ "%x(thing2)%g(buf2)%x(thing3), %x(thing1)
  in %g(title)%n" WHEN PATTERN thing3 IS SPECIFIED AND PATTERN thing4 ISNT SPECIFIED
   OUTPUT "%20fux(thing3) %20fux(thing4) __________"_ "%x(thing3)%g(buf3)%x(thing4), %x(thing1) %x(thing2)
  in %g(title)%n" WHEN PATTERN thing4 IS SPECIFIED AND PATTERN thing5 ISNT SPECIFIED
   OUTPUT "%20fux(thing4) %20fux(thing5) __________"_ "%x(thing4)%g(buf4)%x(thing5), %x(thing2) %x(thing3)
  in %g(title)%n" WHEN PATTERN thing5 IS SPECIFIED AND PATTERN thing6 ISNT SPECIFIED
   OUTPUT "%20fux(thing5) %20fux(thing6) __________"_ "%x(thing5)%g(buf5)%x(thing6), %x(thing3) %x(thing4)
  in %g(title)%n" WHEN PATTERN thing6 IS SPECIFIED AND PATTERN thing7 ISNT SPECIFIED
   OUTPUT "%20fux(thing6) %20fux(thing7) __________"_ "%x(thing6)%g(buf6)%x(thing7), %x(thing4) %x(thing5)
  in %g(title)%n" WHEN PATTERN thing7 IS SPECIFIED AND PATTERN thing8 ISNT SPECIFIED
   OUTPUT "%20fux(thing7) %20fux(thing8) __________"_ "%x(thing7)%g(buf7)%x(thing8), %x(thing5) %x(thing6)
  in %g(title)%n" WHEN PATTERN thing8 IS SPECIFIED

   ;; Handle three-word phrases
   ;;
   ;; The first three words are written in three rotations, keyed on
   ;; (thing2, thing3, thing1), (thing3, thing1, thing2) and
   ;; (thing1, thing2, thing3). Each rotation appends the fourth word when
   ;; there is one, then the "  in <title>" line.
   DO WHEN PATTERN thing3 IS SPECIFIED
      OUTPUT "%20fux(thing2) %20fux(thing3) %10fux(thing1)"_ "%x(thing2)%g(buf2)%x(thing3), %x(thing1)%g(buf1)"
      OUTPUT "%g(buf3)%x(thing4)" WHEN PATTERN thing4 IS SPECIFIED
      OUTPUT "
  in %g(title)%n"
   DONE

   DO WHEN PATTERN thing3 IS SPECIFIED
      OUTPUT "%20fux(thing3) %20fux(thing1) %10fux(thing2)"_ "%x(thing3), %x(thing1)%g(buf1)%x(thing2)%g(buf2)"
      OUTPUT "%g(buf3)%x(thing4)" WHEN PATTERN thing4 IS SPECIFIED
      OUTPUT "
  in %g(title)%n"
   DONE

   DO WHEN PATTERN thing3 IS SPECIFIED
      OUTPUT "%20fux(thing1) %20fux(thing2) %10fux(thing3)"_ "%x(thing1)%g(buf1)%x(thing2)%g(buf2)%x(thing3)"
      OUTPUT "%g(buf3)%x(thing4)" WHEN PATTERN thing4 IS SPECIFIED
      OUTPUT "
  in %g(title)%n"
   DONE

   ;; Sliding three-word windows over the rest of the phrase. Unlike the
   ;; two-word entries these guards are not exclusive: every window whose
   ;; last word exists is written, with one or two earlier words as context.
   OUTPUT "%20fux(thing2) %20fux(thing3) %10fux(thing4)"_ "%x(thing2)%g(buf2)%x(thing3)%g(buf3)%x(thing4), %x(thing1)
  in %g(title)%n" WHEN PATTERN thing4 IS SPECIFIED
   OUTPUT "%20fux(thing3) %20fux(thing4) %10fux(thing5)"_ "%x(thing3)%g(buf3)%x(thing4)%g(buf4)%x(thing5), %x(thing1) %x(thing2)
  in %g(title)%n" WHEN PATTERN thing5 IS SPECIFIED
   OUTPUT "%20fux(thing4) %20fux(thing5) %10fux(thing6)"_ "%x(thing4)%g(buf4)%x(thing5)%g(buf5)%x(thing6), %x(thing2) %x(thing3)
  in %g(title)%n" WHEN PATTERN thing6 IS SPECIFIED
   OUTPUT "%20fux(thing5) %20fux(thing6) %10fux(thing7)"_ "%x(thing5)%g(buf5)%x(thing6)%g(buf6)%x(thing7), %x(thing3) %x(thing4)
  in %g(title)%n" WHEN PATTERN thing7 IS SPECIFIED
   OUTPUT "%20fux(thing6) %20fux(thing7) %10fux(thing8)"_ "%x(thing6)%g(buf6)%x(thing7)%g(buf7)%x(thing8), %x(thing4) %x(thing5)
  in %g(title)%n" WHEN PATTERN thing8 IS SPECIFIED

;;; otherwise, don't index the text
;;; (any character not matched by an earlier rule is consumed here and
;;; produces no output)

FIND ANY ; swallow it