article: find teaser paragraph from variable offset
author Mischa POSLAWSKY <perl@shiar.org>
Sun, 17 May 2020 03:17:38 +0000 (05:17 +0200)
committer Mischa POSLAWSKY <perl@shiar.org>
Sun, 7 Jun 2020 05:44:46 +0000 (07:44 +0200)
Simplify matching by starting at optional first </h2>.
No longer ignores (short) preceding titles such as on Lijtweg home;
more reliable otherwise.

article.inc.php

index dd3d5b2996d777fafe929c593dd666b5c97a5033..dbc545f8df2fadf209f00f76d74190eeb288f505 100644 (file)
@@ -128,19 +128,12 @@ class ArchiveArticle
                        return $override;
                }
 
+               # paragraph contents following the page header if any
+               $offset = strpos($this->raw, '</h2>');
+               $offset = $offset ? $offset + 5 : 0;
                if (preg_match('{
-                       </h2> (?: \s+ | <p\sclass="nav\b.*?</p> | <div[^>]*> )* <p> \s* (.*?) </p>
-               }sx', $this->raw, $bodyp, PREG_OFFSET_CAPTURE)) {
-                       # fallback paragraph contents following the page header
-                       if ($bodyp[1][1] < 512) {
-                               return $bodyp[1][0];
-                       }
-               }
-
-               # starting paragraph for documents without title (assumed simple/partial)
-               if (strpos($this->raw, '<h2') === FALSE and preg_match('{
-                       \A (?: <div [^>]*> \s* )* <p> \s* (.*?) </p>
-               }sx', $this->raw, $bodyp)) {
+                       \G (?: \s+ | <p\sclass="nav\b.*?</p> | <div [^>]*> )* <p> \s* (.*?) </p>
+               }sx', $this->raw, $bodyp, 0, $offset)) {
                        return $bodyp[1];
                }
        }