Add 'mojo-useragent-bibsonomy.pl'
mojo-useragent-bibsonomy.pl | 867 | Normal file
@@ -0,0 +1,867 @@
#!/usr/bin/env perl

my $license = <<"LICENSE";
MIT License

Copyright (c) 2021 Marlon Maschkiwitz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
LICENSE

use strict;
use warnings;
use open qw(:locale);
use feature qw(say switch);
use POSIX qw(strftime);
use Data::Validate::URI qw(is_web_uri);
use File::Path qw(make_path);
use List::Util qw(min max uniq);
use File::Copy qw(copy);
use utf8;
use Benchmark;
use Data::Dumper;
use URI;
use Getopt::Long;
use File::Basename;
use LWP::UserAgent;
use DBIx::Connector;
use URI::Encode;
use Search::Sitemap::Index;
use Search::Sitemap;

#
# Global VARS
#
my (%uniq, $ua, %data, $connection, $reached_link_limit);
my $actionNr = 1;
my %config = (
    'CONSTANTS' => {},    # populated at runtime by setConfigConstants()
    'MYSQL' => {
        'ACCESS' => {
            'database' => 'bibsonomy',
            'hostname' => '127.0.0.1',
            'user'     => 'mma',
            'pw'       => 'BP5ZDavtZN',
            'port'     => 3306,
        },
        'OPTIONS' => {
            'PrintError'    => 0,
            'RaiseError'    => 1,
            'AutoCommit'    => 1,
            'Reconnect'     => 1,
            'LimitPerQuery' => 10000,
        },
        'BIBSONOMY' => {
            "publicationsUser" => {
                "query"   => "SELECT simhash1, user_name FROM bibtex b JOIN user u USING (user_name) WHERE b.group=0 AND u.spammer=0 AND u.to_classify=0 AND u.updated_by != 'classifier' AND u.updated_by IS NOT NULL AND b.user_name NOT IN ('dblp', 'genealogie') ORDER BY simhash1 LIMIT %d OFFSET %d",
                "id_key"  => "simhash1",
                "prepend" => "publication",
            },
            "publicationsPublic" => {
                "query"   => "SELECT simhash1, user_name FROM bibtex b JOIN user u USING (user_name) WHERE b.group=0 AND u.spammer=0 AND u.to_classify=0 AND u.updated_by != 'classifier' AND u.updated_by IS NOT NULL AND b.user_name NOT IN ('dblp', 'genealogie') ORDER BY simhash1 LIMIT %d OFFSET %d;",
                "id_key"  => "simhash1",
                "prepend" => "publication",
            },
            "groups" => {
                "query"   => "SELECT group_name from groupids g WHERE g.group > 2 LIMIT %d OFFSET %d;",
                "id_key"  => "group_name",
                "prepend" => "group",
            },
            "users" => {
                "query"   => "SELECT user_name FROM user u WHERE u.spammer=0 AND u.to_classify=0 AND u.updated_by != 'classifier' AND u.updated_by IS NOT NULL AND u.user_name NOT IN ('dblp', 'genealogie') LIMIT %d OFFSET %d;",
                "id_key"  => "user_name",
                "prepend" => "user",
            },
        },
    },
    'PING' => {
        'GOOGLE' => {
            'URL' => 'http://www.google.com/ping?sitemap=',
        },
    },
    'GetOpts' => {
        'URL'               => 'https://bibsonomy.org',
        'ALLOWED_RESPONSES' => [],
        'LINK_LIMIT'        => undef,
        'LINK_DEPTH'        => 3,
        'SITEMAP_LIMIT'     => 50000,
        'SITEMAP_COMPRESS'  => 1,
        'OUTPUT_LINKS'      => 0,
        'LINK_REGEX'        => [],
        'SITEMAP_ROOT'      => undef,
        'SITEMAP_SUBMIT'    => 0,
        'VERBOSE_LEVEL'     => 0,
        'OUTPUT_CONFIG'     => 0,
        'OUTPUT_DATA'       => 0,
        'CHECK_UNIQUE'      => 1,
        'CHECK_DEPTH'       => 0,
        'CHECK_RESPONSES'   => 0,
        'CHECK_REGEX'       => 0,
    },
    'urlChecks' => {    # a check only runs when its GetOpts flag is set
        'CHECK_UNIQUE'    => \&checkLinkUnique,
        'CHECK_DEPTH'     => \&checkLinkDepth,
        'CHECK_REGEX'     => \&checkLinkRegex,
        'CHECK_RESPONSES' => \&checkLinkResponse,
    },
    "staticLinks" => ['gettingStarted', 'help', 'groups', 'popular', 'tags', 'persons'],
);
my ($getOpts, $constants, $mySQLAccess, $mySQLOptions, $mySQLBibsonomy) = ($config{'GetOpts'}, $config{'CONSTANTS'}, $config{'MYSQL'}{'ACCESS'}, $config{'MYSQL'}{'OPTIONS'}, $config{'MYSQL'}{'BIBSONOMY'});


#
# Workflow
#
&startBenchmarking();
&getCommandFlags();
&checkUrlArg();
&checkSitemapRootArg();
&checkResponsesArg();
&setConfigConstants();
&checkSitemapSubmitArg();
&initCrawling();
&canCopy();
&printScriptHeaders();
&runCrawlerBibsonomy();
&generateSitemaps();
&submitSitemaps();
&finishRun();
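
# Note: the call order above matters. getCommandFlags() must run before the
# argument checks, and setConfigConstants() must run before initCrawling(),
# since RUN_DIR and the sitemap file names are derived from --url and the run
# timestamp.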


#
# Initialization methods
#
sub initCrawling {
    &Delimiter((caller(0))[3], 4);
    my $runDir = $constants->{'RUN_DIR'};

    # REGEX ARGS
    if (@{$getOpts->{'LINK_REGEX'}}) {
        $getOpts->{'CHECK_REGEX'} = 1;
        for my $i ( 0 .. scalar @{$getOpts->{'LINK_REGEX'}}-1 ) {
            $getOpts->{LINK_REGEX}[$i] = qr/$getOpts->{LINK_REGEX}[$i]/;
        }
    }

    $connection = connectToMySql($mySQLAccess->{'database'});
    &logAssert("Creating run directory: $constants->{'RUN_DIR'}", 3);
    make_path($constants->{'RUN_DIR'}, { chmod => 0777 });
    removeDuplicateValues();
}


sub logAssert {
    my ($message, $verboseLevel) = @_;
    say $message if $verboseLevel <= $getOpts->{'VERBOSE_LEVEL'};
}


sub getCommandFlags {
    &Delimiter((caller(0))[3], 4);
    GetOptions(
        'url=s'            => \$getOpts->{'URL'},
        'responses=s'      => \@{$getOpts->{'ALLOWED_RESPONSES'}},
        'link-limit=i'     => \$getOpts->{'LINK_LIMIT'},
        'link-depth=i'     => \$getOpts->{'LINK_DEPTH'},
        'sitemap-limit=i'  => \$getOpts->{'SITEMAP_LIMIT'},
        'compress!'        => \$getOpts->{'SITEMAP_COMPRESS'},
        'output-links'     => \$getOpts->{'OUTPUT_LINKS'},
        'link-regex=s'     => \@{$getOpts->{'LINK_REGEX'}},
        'sitemap-root=s'   => \$getOpts->{'SITEMAP_ROOT'},
        'sitemap-submit'   => \$getOpts->{'SITEMAP_SUBMIT'},
        'output-config'    => \$getOpts->{'OUTPUT_CONFIG'},
        'output-data'      => \$getOpts->{'OUTPUT_DATA'},
        'verbose-level=i'  => \$getOpts->{'VERBOSE_LEVEL'},
        'check-unique!'    => \$getOpts->{'CHECK_UNIQUE'},
        'check-depth!'     => \$getOpts->{'CHECK_DEPTH'},
        'check-responses!' => \$getOpts->{'CHECK_RESPONSES'},
    ) or die "Invalid command line arguments\n";
}


sub checkUrlArg {
    &Delimiter((caller(0))[3], 4);
    return if !$getOpts->{'URL'};
    die "--url \"$getOpts->{'URL'}\" is not a valid web URI. Make sure it starts with http or https." if !is_web_uri($getOpts->{'URL'});
}


sub checkResponsesArg {
    &Delimiter((caller(0))[3], 4);
    return if !@{$getOpts->{'ALLOWED_RESPONSES'}} && !$getOpts->{'CHECK_RESPONSES'};
    die "You must use --responses <int> together with --check-responses" if !@{$getOpts->{'ALLOWED_RESPONSES'}} || !$getOpts->{'CHECK_RESPONSES'};
}


sub checkSitemapRootArg {
    &Delimiter((caller(0))[3], 4);
    return if !$getOpts->{'SITEMAP_ROOT'};
    die "--sitemap-root \"$getOpts->{'SITEMAP_ROOT'}\" does not exist or is not a directory." if (!-d $getOpts->{'SITEMAP_ROOT'});
}


sub checkSitemapSubmitArg {
    &Delimiter((caller(0))[3], 4);
    return if !$getOpts->{'SITEMAP_ROOT'} && !$getOpts->{'SITEMAP_SUBMIT'};

    if( $getOpts->{'SITEMAP_SUBMIT'} && !$getOpts->{'SITEMAP_ROOT'} ) {
        die "When using --sitemap-submit you must also set --sitemap-root, the absolute path of the website's root folder.";
    }elsif( $getOpts->{'SITEMAP_SUBMIT'} && $getOpts->{'SITEMAP_ROOT'} ) {
        die "--sitemap-root \"$getOpts->{'SITEMAP_ROOT'}\" does not exist." if (!-e $getOpts->{'SITEMAP_ROOT'});
        die "--sitemap-root \"$getOpts->{'SITEMAP_ROOT'}\" is not a directory." if (!-d $getOpts->{'SITEMAP_ROOT'});
    }
}


sub removeDuplicateValues {
    &Delimiter((caller(0))[3], 4);
    foreach ('ALLOWED_RESPONSES', 'LINK_REGEX') {
        @{$getOpts->{$_}} = uniq(@{$getOpts->{$_}});
    }
}


sub setConfigConstants {
    &Delimiter((caller(0))[3], 4);

    my $u = URI->new( $getOpts->{'URL'} );
    ($data{'URL'}{'SITEMAP_ROOT'} = $u->canonical->scheme . '://' . $u->canonical->host . $u->canonical->path) =~ s/\/$//;
    $constants->{'URL_HOST'}           = remove_www( URI->new($getOpts->{'URL'})->canonical->host );
    $constants->{'TIMESTAMP'}          = GetTimestamp('YMDHMS');
    $constants->{'RUN_DIR'}            = dirname(__FILE__) . '/run/' . $constants->{'TIMESTAMP'};
    $constants->{'SITEMAP_EXT'}        = $getOpts->{'SITEMAP_COMPRESS'} ? '.xml.gz' : '.xml';
    $constants->{'SITEMAP_FILE'}       = $constants->{'RUN_DIR'} . '/sitemap' . $constants->{'SITEMAP_EXT'};
    $constants->{'SITEMAP_INDEX_FILE'} = $constants->{'RUN_DIR'} . '/sitemap-index' . $constants->{'SITEMAP_EXT'};
    $constants->{'OUTPUT_FILE'}        = $constants->{'RUN_DIR'} . '/links';
}


#
# BibSonomy Methods
#
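
# getBibsonomyUrls() pages through each configured query with LIMIT/OFFSET:
# db_query() returns a hashref keyed by the query's id_key column, and the
# while loop stops once a page comes back empty (an empty hash assignment is
# false in boolean context) or the --link-limit has been reached.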
sub getBibsonomyUrls {
    &Delimiter((caller(0))[3], 4);

    foreach my $key (keys %{$mySQLBibsonomy}) {
        my $offset = 0;
        printActionNrString("Querying $key......");
        while ( my %results = %{ db_query(sprintf($mySQLBibsonomy->{$key}{"query"}, $mySQLOptions->{'LimitPerQuery'}, $offset), $connection, 'selectall_hashref', $mySQLBibsonomy->{$key}{"id_key"}) } ) {
            return if is_link_limit();
            $offset += $mySQLOptions->{'LimitPerQuery'};
            my $valid = createBibsonomyUrls(\%results, $key);
            push @{$data{'LINKS'}{200}}, @{$valid};
            &logAssert(('-> ' . scalar @{$data{'LINKS'}{200}}), 2);
        }
    }
}
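

# createBibsonomyUrls() turns every result row into a URL of the form
# <site root>/<prepend>/<id>, e.g. /publication/<simhash1> or /user/<user_name>;
# for publicationsUser the owning user_name is appended as an extra path
# segment. Each candidate URL still has to pass is_valid_link() before it is
# kept.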
sub createBibsonomyUrls {
    &Delimiter((caller(0))[3], 4);
    my ($results_ref, $sqlKey) = @_;
    my @valid;
    my $prepend = "$data{'URL'}{'SITEMAP_ROOT'}/$mySQLBibsonomy->{$sqlKey}{'prepend'}";

    foreach my $key ( keys %{$results_ref} ) {
        my $link_string = "$prepend/$key";
        $link_string .= "/$$results_ref{$key}{'user_name'}" if $sqlKey eq "publicationsUser";
        push(@valid, $link_string) if &is_valid_link(URI->new($link_string));
    }

    return \@valid;
}


sub runCrawlerBibsonomy {
    &Delimiter((caller(0))[3], 4);
    printActionNrString("Pushing static links...");
    @{$data{'LINKS'}{200}} = map { "$data{'URL'}{'SITEMAP_ROOT'}/$_" } (@{$config{'staticLinks'}});
    &logAssert(('-> ' . scalar @{$data{'LINKS'}{200}}), 2);
    getBibsonomyUrls();
}


#
# Sitemap Methods
#
sub submitSitemaps {
    &Delimiter((caller(0))[3], 4);
    my $sitemapUrls;
    $sitemapUrls = copySitemaps() if $getOpts->{'SITEMAP_ROOT'};
    return if !$getOpts->{'SITEMAP_SUBMIT'};
    pingSearchEngine($sitemapUrls);
}


sub copySitemaps {
    &Delimiter((caller(0))[3], 4);
    my @sitemapUrls;

    foreach my $i (0 .. scalar @{$data{'SITEMAPS'}}-1 ) {
        my ($file, $dir, $ext) = fileparse($data{'SITEMAPS'}[$i], qr/\.[^.]*/);
        my $new_location = $getOpts->{'SITEMAP_ROOT'} . '/' . $file . $ext;
        my $sitemap_url  = $data{'URL'}{'SITEMAP_ROOT'} . '/' . $file . $ext;
        push( @sitemapUrls, $sitemap_url );
        copy $data{'SITEMAPS'}[$i], $new_location or warn "Copy failed: $new_location $!";
    }

    return \@sitemapUrls;
}
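

# pingSearchEngine() notifies every engine configured under $config{'PING'}
# by appending the URI-encoded sitemap URL to the engine's ping endpoint and
# issuing a plain GET request; the response code is only logged.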
sub pingSearchEngine {
    &Delimiter((caller(0))[3], 4);
    my $sitemapUrls_ref = shift;
    my $uri = URI::Encode->new( { encode_reserved => 0 } );

    for my $url (@{$sitemapUrls_ref}) {
        for my $engine ( keys %{$config{'PING'}} ) {
            my $ping_url = $config{'PING'}{$engine}{'URL'} . $uri->encode($url);
            &logAssert("\tSubmitting to $engine: $url -> $ping_url", 3);
            my $code = &getStatusCode($ping_url);
            &logAssert("\t$code", 2);
        }
    }
}


sub printActionNrString {
    &Delimiter((caller(0))[3], 4);
    my $message = shift;
    return if $getOpts->{'VERBOSE_LEVEL'} < 1;
    say "$actionNr) $message";
    $actionNr++;
}


sub createSitemap {
    &Delimiter((caller(0))[3], 4);
    my ($index, $sitemap) = @_;
    my ($file, $dir, $ext) = fileparse($constants->{'SITEMAP_FILE'}, qr/\.[^.]*/);
    my $output_file = sprintf("%ssitemap_%d%s", $dir, $index, $constants->{'SITEMAP_EXT'});
    printActionNrString("Writing $output_file...");
    $sitemap->write($output_file);
    push( @{$data{'SITEMAPS'}}, $output_file );
}


sub addToSitemapIndex {
    &Delimiter((caller(0))[3], 4);
    my ($index, $indexSitemap) = @_;
    my $sitemap_file = sprintf("%s/sitemap_%d%s", $data{'URL'}{'SITEMAP_ROOT'}, $index, $constants->{'SITEMAP_EXT'});
    $indexSitemap->add(Search::Sitemap::URL->new(loc => $sitemap_file));
}
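

# generateSitemaps() fills one Search::Sitemap object at a time. Whenever
# SITEMAP_LIMIT URLs have been added (50,000 by default, which is the maximum
# the sitemap protocol allows per file), the current sitemap is written out,
# registered in the sitemap index, and a fresh sitemap object is started;
# writeSitemapFiles() then flushes whatever remains.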
sub generateSitemaps {
    &Delimiter((caller(0))[3], 4);
    my $indexSitemap = Search::Sitemap::Index->new('pretty' => 1);
    my $sitemap = Search::Sitemap->new(pretty => 1);
    my ($i, $index) = (0, 0);

    foreach my $url ( @{$data{'LINKS'}{'200'}} ) {
        $sitemap->add(Search::Sitemap::URL->new(loc => $url));
        $i++;
        if( $i == $getOpts->{'SITEMAP_LIMIT'} ) {
            $index++;
            $i = 0;
            createSitemap($index, $sitemap);
            addToSitemapIndex($index, $indexSitemap);
            $sitemap = Search::Sitemap->new(pretty => 1);
        }
    }

    writeSitemapFiles($sitemap, $indexSitemap, $index, $i);
}


sub writeSitemapFiles {
    &Delimiter((caller(0))[3], 4);
    my ($sitemap, $indexSitemap, $index, $i) = @_;

    if( $i < $getOpts->{'SITEMAP_LIMIT'} && $index == 0 ) {
        $sitemap->write( $constants->{'SITEMAP_FILE'} );
        push( @{$data{'SITEMAPS'}}, $constants->{'SITEMAP_FILE'} );
    }elsif( $i < $getOpts->{'SITEMAP_LIMIT'} && $index > 0 && $i > 0) {
        $index++;
        createSitemap($index, $sitemap);
        addToSitemapIndex($index, $indexSitemap);
        createSitemapIndex($indexSitemap);
    }elsif( $i < $getOpts->{'SITEMAP_LIMIT'} && $index > 0 && $i == 0) {
        createSitemapIndex($indexSitemap);
    }
}


sub createSitemapIndex {
    &Delimiter((caller(0))[3], 4);
    my $indexSitemap = shift;
    printActionNrString("Writing $constants->{'SITEMAP_INDEX_FILE'}...");
    $indexSitemap->write( $constants->{'SITEMAP_INDEX_FILE'} );
    push( @{$data{'SITEMAPS'}}, $constants->{'SITEMAP_INDEX_FILE'} );
}


#
# URL validate Methods
#
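
# is_valid_link() dispatches through $config{'urlChecks'}: every key is a
# GetOpts flag and its value the validator to run, so a check is only applied
# when the corresponding flag is enabled. A link counts as valid only if all
# enabled checks pass.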
sub is_valid_link {
    &Delimiter((caller(0))[3], 4);
    my ($link) = @_;
    # count the path segments by counting the regex matches in list context
    my $link_depth = 0;
    $link_depth = () = $link->path =~ /\/.+?/g if $getOpts->{'LINK_DEPTH'};
    my $link_host = &remove_www($link->host);
    &logAssert("-> $link", 3);
    my $code;
    $code = getStatusCode($link->canonical->as_string) if $getOpts->{'CHECK_RESPONSES'};

    for my $key (keys %{$config{'urlChecks'}}) {
        next if !$getOpts->{$key};
        return 0 if !$config{'urlChecks'}{$key}->($link->canonical->as_string, $link->scheme, $link_depth, $link_host, $code);
    }

    return 1;
}


sub checkLinkUnique {
    &Delimiter((caller(0))[3], 4);
    my ($link_string, $link_protocol, $link_depth, $link_host) = @_;

    if ( ++$uniq{$link_string} > 1 ) {
        &logAssert("\tFAIL (UNIQ)", 3);
        return 0;
    }

    return 1;
}


sub checkLinkDepth {
    &Delimiter((caller(0))[3], 4);
    my ($link_string, $link_protocol, $link_depth, $link_host) = @_;

    if ($link_depth > $getOpts->{'LINK_DEPTH'}) {
        &logAssert("\tFAIL (DEPTH $link_depth)", 3);
        return 0;
    }

    return 1;
}


sub checkLinkRegex {
    &Delimiter((caller(0))[3], 4);
    my ($link_string, $link_protocol, $link_depth, $link_host) = @_;

    if(!&check_multiple_regex($link_string) ) {
        &logAssert("\tFAIL (REGEX)", 3);
        return 0;
    }

    return 1;
}


sub getStatusCode {
    &Delimiter((caller(0))[3], 4);
    my ($link_string) = @_;
    my $uaLWP = LWP::UserAgent->new;
    my $response = $uaLWP->get($link_string);
    return $response->code;
}


sub checkLinkResponse {
    &Delimiter((caller(0))[3], 4);
    my ($link_string, $link_protocol, $link_depth, $link_host, $code) = @_;

    if (!isInArray($code, $getOpts->{'ALLOWED_RESPONSES'})) {
        &logAssert("\tFAIL (Status-Code not allowed: $code)", 3);
        return 0;
    }

    return 1;
}


#
# Finish Methods
#
sub calcMinMaxAvg {
    return '' if $getOpts->{'VERBOSE_LEVEL'} < 1;
    return sprintf("URL lengths: min=%d, max=%d, avg=%d", get_min_max_avg());
}


sub calcRuntime {
    return '' if $getOpts->{'VERBOSE_LEVEL'} < 1;
    return 'Runtime: ' . &get_runtime($data{'STATISTICS'}{'TIME'}{'START'}, time());
}


sub endBenchmark {
    return '' if $getOpts->{'VERBOSE_LEVEL'} < 1;
    my $t1 = Benchmark->new;
    my $td = timediff($t1, $data{'STATISTICS'}{'BENCHMARK_START'});
    return "Benchmark: " . timestr($td);
}


sub runCompleted {
    return '' if $getOpts->{'VERBOSE_LEVEL'} < 1;
    my $timestamp = &GetTimestamp('YMDHMS_r');
    return "RUN $constants->{'TIMESTAMP'} completed @ $timestamp";
}


sub finishRun {
    &Delimiter((caller(0))[3], 4);
    $connection->disconnect();
    outputLinks() if $getOpts->{'OUTPUT_LINKS'};
    if ($getOpts->{'VERBOSE_LEVEL'} >= 1) {
        say runCompleted();
        &logAssert(('-' x 80), 1);
        say calcMinMaxAvg();
        say calcRuntime();
        say endBenchmark();
    }
    printToFile("$constants->{'RUN_DIR'}/config", \%config) if $getOpts->{'OUTPUT_CONFIG'};
    printToFile("$constants->{'RUN_DIR'}/data", \%data) if $getOpts->{'OUTPUT_DATA'};
}


sub get_min_max_avg {
    &Delimiter((caller(0))[3], 4);
    return (0, 0, 0) if !@{$data{'LINKS'}{200}};
    my $sum = 0;
    my @all_links;

    for my $key ( keys %{$data{'LINKS'}} ) {
        @all_links = (@all_links, @{$data{'LINKS'}{$key}});
    }

    foreach my $i (0 .. $#all_links) {
        $all_links[$i] = length($all_links[$i]);
        $sum += $all_links[$i];
    }

    return (min(@all_links), max(@all_links), int($sum / @all_links));
}


sub get_checked_links {
    &Delimiter((caller(0))[3], 4);
    my $size = 0;

    for my $key ( keys %{$data{'LINKS'}} ) {
        $size += scalar @{$data{'LINKS'}{$key}};
    }

    return $size;
}


#
# Helper methods
#
sub isInArray {
    &Delimiter((caller(0))[3], 4);
    my ($element, $arr_ref) = @_;
    my %hash = map { $_ => 1 } @{$arr_ref};
    return exists($hash{$element});
}
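

# connectToMySql() builds the DBI DSN from the MYSQL ACCESS settings and lets
# DBIx::Connector manage the connection; only the underlying database handle
# is handed back to the caller.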
sub connectToMySql {
    &Delimiter((caller(0))[3], 4);
    my ($db) = @_;
    # assemble the connection string from the MySQL config values
    my $connectionInfo = "dbi:mysql:database=$db;host=$mySQLAccess->{'hostname'};port=$mySQLAccess->{'port'}";
    my $l_connection = DBIx::Connector->new(
        $connectionInfo,
        $mySQLAccess->{'user'},
        $mySQLAccess->{'pw'},
        {
            RaiseError           => $mySQLOptions->{'RaiseError'},    # SUGGESTED BY AnyEvent::DBI::MySQL
            AutoCommit           => $mySQLOptions->{'AutoCommit'},
            mysql_auto_reconnect => $mySQLOptions->{'Reconnect'},
        }
    );

    return $l_connection->dbh;
}


sub startBenchmarking {
    &Delimiter((caller(0))[3], 4);
    $data{'STATISTICS'}{'BENCHMARK_START'} = Benchmark->new;
    $data{'STATISTICS'}{'TIME'}{'START'}   = time();
    $data{'STATISTICS'}{'CONTENT_LENGTH'}  = 0;
}


sub printScriptHeaders {
    &Delimiter((caller(0))[3], 4);
    return if $getOpts->{'VERBOSE_LEVEL'} < 1;
    my $timestamp_r = GetTimestamp('YMDHMS_r');

    say '=' x 80;
    say "$license";
    say '-' x 80;
    say 'Sitemap Generator v1.0.0 by Marlon Maschkiwitz';
    say 'E-Mail: zino@zinomedia.de';
    say '=' x 80;
    say 'Start checking @ ', $timestamp_r . "\n";
}


sub printToFile {
    &Delimiter((caller(0))[3], 4);
    my ($filepath, $ref) = @_;

    # DATA DUMPER UTF8 HACK
    no warnings 'redefine';
    local *Data::Dumper::qquote = sub { qq["${\(shift)}"] };
    local $Data::Dumper::Useperl = 1;

    open my $FILE, '>:encoding(UTF-8)', $filepath or die "Cannot open '$filepath': $!";
    print $FILE Dumper $ref;
    close $FILE;
}


# Verify early that sitemap files can be copied into --sitemap-root.
sub canCopy {
    &Delimiter((caller(0))[3], 4);
    return if !$getOpts->{'SITEMAP_SUBMIT'} && !$getOpts->{'SITEMAP_ROOT'};
    my $tmpfile = &tempfile();
    my $new_location = "$getOpts->{'SITEMAP_ROOT'}/tmp";

    copy $tmpfile, $new_location or do {
        unlink $tmpfile;
        die "Cannot copy to $new_location: $!";
    };

    unlink($tmpfile, $new_location);
}
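

# db_query() is a thin wrapper around DBI's selectall_hashref(): it returns a
# hashref of rows keyed by the column named in $rest[0], which the callers set
# to the query's id_key.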
sub db_query {
    &Delimiter((caller(0))[3], 4);
    my ($query, $connection, $switch, @rest) = @_;
    my $response = $connection->selectall_hashref($query, $rest[0]) or die $connection->errstr;
    return $response;
}


sub trim {
    &Delimiter((caller(0))[3], 4);
    my $s = shift;
    return $s =~ s/^\s+|\s+$//gr;
}


sub is_link_limit {
    &Delimiter((caller(0))[3], 4);
    return if !$getOpts->{'LINK_LIMIT'};
    if( exists $data{'LINKS'}{200} && scalar @{$data{'LINKS'}{200}} >= $getOpts->{'LINK_LIMIT'} ) {
        &logAssert(('LINK_LIMIT of ' . $getOpts->{'LINK_LIMIT'} . ' reached'), 3);
        $reached_link_limit = 1;
        return 1;
    }else {
        return 0;
    }
}


sub remove_www {
    &Delimiter((caller(0))[3], 4);
    my $str = shift;
    return $str =~ s/^www\.//ir;
}
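

# check_multiple_regex() smartmatches the URL string against the array of
# compiled --link-regex patterns; the when() branch is taken as soon as the
# string matches any pattern in the list.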
sub check_multiple_regex {
    &Delimiter((caller(0))[3], 4);
    my $str = shift;

    no warnings 'experimental::smartmatch';
    given ($str) {
        when (@{$getOpts->{'LINK_REGEX'}}) {
            return 1;
        }
        default {
            return 0;
        }
    }

    return 0;
}


sub GetTimestamp {
    &Delimiter((caller(0))[3], 4);
    my $format = shift;
    my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
    return sprintf( "%04d%02d%02d_%02d%02d%02d", $year+1900, $mon+1, $mday, $hour, $min, $sec ) if $format eq 'YMDHMS';
    return sprintf( "%04d-%02d-%02d %02d:%02d:%02d", 1900 + $year, $mon + 1, $mday, $hour, $min, $sec ) if $format eq 'YMDHMS_r';
}


sub Delimiter {
    my ($subName, $verboseLevel) = @_;
    return if $verboseLevel > $getOpts->{'VERBOSE_LEVEL'};
    say "\n" . "-" x 80;
    print "> " . $subName;
    say "\n" . '-' x 80;
}


sub get_runtime {
    &Delimiter((caller(0))[3], 4);
    my ($t1, $t2) = @_;
    return strftime( "%T", gmtime( $t2 - $t1 ) );
}


sub tempfile {
    &Delimiter((caller(0))[3], 4);
    return if !-e $constants->{'RUN_DIR'};
    my $filename = "$constants->{'RUN_DIR'}/tmp";
    open(my $fh, '>', $filename) or warn "Could not create tmp file '$filename' $!";
    close($fh);
    return $filename;
}


sub outputLinks {
    return if (!$getOpts->{'OUTPUT_LINKS'});
    for my $key ( keys %{$data{'LINKS'}} ) {
        &logAssert("Total links with status code $key found: " . scalar @{$data{'LINKS'}{$key}}, 1);
        printToFilePlain("$constants->{'OUTPUT_FILE'}_$key", \@{$data{'LINKS'}{$key}});
    }
}


sub printToFilePlain {
    &Delimiter((caller(0))[3], 4);
    my ($filepath, $ref) = @_;
    open my $FILE, '>:encoding(UTF-8)', $filepath or die "Cannot open '$filepath': $!";
    print $FILE join("\n", @{$ref});
    close $FILE;
}


__END__

=pod

=encoding UTF-8

=head1 NAME

BibSonomy Dynamic Sitemap Generator

=head1 IMPORTANT USAGE GUIDELINES

To run the script with the default options, run it without any arguments.
Some routines may require root-level privileges.

=head1 SYNOPSIS

usage: script [--url <string>] [--link-limit <int>] [--link-depth <int>] [--sitemap-limit <int>] [--link-regex <string>] [--sitemap-root <string>] [--verbose-level <int>] [--responses <int>] [--nocompress] [--output-links] [--nocheck-unique] [--check-depth] [--check-responses] [--sitemap-submit] [--output-config]
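
For example (hypothetical host and paths):

    # generate compressed sitemaps and write the URL lists to plain files
    perl mojo-useragent-bibsonomy.pl --url https://www.bibsonomy.org --verbose-level 2 --output-links

    # copy the sitemaps into the web root and ping the configured search engines
    perl mojo-useragent-bibsonomy.pl --url https://www.bibsonomy.org --sitemap-root /var/www/html --sitemap-submit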

=head1 OPTION ARGUMENTS

--url <string> | Domain including protocol that is placed in front of each URL during the URL generation process

--link-limit <int> | Maximum number of generated URLs. Note that the number of generated URLs can differ from the provided limit by up to ~10,000 URLs due to the database query offset.

--link-depth <int> | Maximum depth of generated URLs. Works only alongside --check-depth. Default: 3

--sitemap-limit <int> | Limit that determines when the sitemap gets split into multiple sitemaps. Once a split occurs, a sitemap index covering all single sitemaps is generated as well. Default: 50000

--link-regex <string> | If provided, links must match the regex to count as valid. Use multiple times to provide more than one. Default: Disabled

--sitemap-root <string> | Defines the location of the root directory to which sitemaps will be moved. Only works alongside --sitemap-submit. Default: Disabled

--verbose-level <int> | Specifies the verbosity level of the displayed STDOUT messages

--responses <int> | Add response codes that count as a valid link. Use multiple times to provide more than one. Only works alongside --check-responses. Default: 200

=head1 FLAG ARGUMENTS

--nocompress | Sitemaps will not be compressed as .gz files. Default: Compressed

--output-links | Determines whether plain files with URLs will be generated. Each status code generates a separate file with the corresponding links. Default: Disabled

--nocheck-unique | Allows duplicate URLs in the sitemaps.

--check-depth | Check the depth of the URLs. The maximum depth can be overridden via --link-depth.

--check-responses | Check the status code of each generated URL. Permitted status codes can be overridden via --responses

--sitemap-submit | Determines whether sitemap files will be submitted to search engines. Sitemap files need to be moved to the website root directory, so this only works alongside --sitemap-root.

--output-config | Output the config hash to the file "config" in the current run dir

--output-data | Output the data hash to the file "data" in the current run dir

=head1 DESCRIPTION

The Sitemap Protocol allows you to inform search engine crawlers about URLs on your Web sites that are available for crawling.
A Sitemap consists of a list of URLs and may also contain additional information about those URLs, such as when they were last modified, how frequently they change, etc.
This script allows you to dynamically generate URLs from the BibSonomy database and to create, modify and submit sitemaps to multiple search engines.

=head1 REQUIRED MODULES

L<Data::Validate::URI>
L<File::Path>
L<List::Util>
L<File::Copy>
L<Data::Dumper>
L<URI>
L<URI::Encode>
L<Getopt::Long>
L<File::Basename>
L<Search::Sitemap::Index>
L<LWP::UserAgent>
L<DBIx::Connector>
L<Search::Sitemap>

=head1 AUTHOR

Marlon Maschkiwitz - zino@zinomedia.de

=head1 COPYRIGHT AND LICENSE

MIT License

Copyright (c) 2021 Marlon Maschkiwitz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

=cut