diff --git a/mojo-useragent-bibsonomy.pl b/mojo-useragent-bibsonomy.pl
new file mode 100644
index 0000000..c3f7bab
--- /dev/null
+++ b/mojo-useragent-bibsonomy.pl
@@ -0,0 +1,867 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use open qw(:locale);
+use feature qw(say);
+use POSIX qw(strftime);
+use Data::Validate::URI qw(is_web_uri);
+use File::Path qw(make_path);
+use List::Util qw(min max uniq);
+use File::Copy qw(copy);
+use utf8;
+use Benchmark;
+use Data::Dumper;
+use URI;
+use Getopt::Long;
+use File::Basename;
+use LWP::UserAgent;
+use DBIx::Connector;
+use URI::Encode;
+use Search::Sitemap::Index;
+use Search::Sitemap;
+
+my $license = <<"LICENSE";
+MIT License
+
+Copyright (c) 2021 Marlon Maschkiwitz
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+LICENSE
+
+#
+# Global VARS
+#
+my (%uniq, $ua, %data, $connection, $reached_link_limit);
+my $actionNr = 1;
+my %config = (
+    'MYSQL' => {
+        'ACCESS' => {
+            'database' => 'bibsonomy',
+            'hostname' => '127.0.0.1',
+            'user'     => 'mma',
+            'pw'       => 'BP5ZDavtZN',
+            'port'     => 3306,
+        },
+        'OPTIONS' => {
+            'PrintError'    => 0,
+            'RaiseError'    => 1,
+            'AutoCommit'    => 1,
+            'Reconnect'     => 1,
+            'LimitPerQuery' => 10000,
+        },
+        'BIBSONOMY' => {
+            'publicationsUser' => {
+                'query'   => "SELECT simhash1, user_name FROM bibtex b JOIN user u USING (user_name) WHERE b.group=0 AND u.spammer=0 AND u.to_classify=0 AND u.updated_by != 'classifier' AND u.updated_by IS NOT NULL AND b.user_name NOT IN ('dblp', 'genealogie') ORDER BY simhash1 LIMIT %d OFFSET %d",
+                'id_key'  => 'simhash1',
+                'prepend' => 'publication',
+            },
+            'publicationsPublic' => {
+                'query'   => "SELECT simhash1, user_name FROM bibtex b JOIN user u USING (user_name) WHERE b.group=0 AND u.spammer=0 AND u.to_classify=0 AND u.updated_by != 'classifier' AND u.updated_by IS NOT NULL AND b.user_name NOT IN ('dblp', 'genealogie') ORDER BY simhash1 LIMIT %d OFFSET %d;",
+                'id_key'  => 'simhash1',
+                'prepend' => 'publication',
+            },
+            'groups' => {
+                'query'   => "SELECT group_name FROM groupids g WHERE g.group > 2 LIMIT %d OFFSET %d;",
+                'id_key'  => 'group_name',
+                'prepend' => 'group',
+            },
+            'users' => {
+                'query'   => "SELECT user_name FROM user u WHERE u.spammer=0 AND u.to_classify=0 AND u.updated_by != 'classifier' AND u.updated_by IS NOT NULL AND u.user_name NOT IN ('dblp', 'genealogie') LIMIT %d OFFSET %d;",
+                'id_key'  => 'user_name',
+                'prepend' => 'user',
+            },
+        },
+    },
+    'PING' => {
+        'GOOGLE' => {
+            'URL' => 'http://www.google.com/ping?sitemap=',
+        },
+    },
+    'GetOpts' => {
+        'URL'               => 'https://bibsonomy.org',
+        'ALLOWED_RESPONSES' => [],
+        'LINK_LIMIT'        => undef,
+        'LINK_DEPTH'        => 3,
+        'SITEMAP_LIMIT'     => 50000,
+        'SITEMAP_COMPRESS'  => 1,
+        'OUTPUT_LINKS'      => 0,
+        'LINK_REGEX'        => [],
+        'SITEMAP_ROOT'      => undef,
+        'SITEMAP_SUBMIT'    => 0,
+        'VERBOSE_LEVEL'     => 0,
+        'OUTPUT_CONFIG'     => 0,
+        'OUTPUT_DATA'       => 0,
+        'CHECK_UNIQUE'      => 1,
+        'CHECK_DEPTH'       => 0,
+        'CHECK_RESPONSES'   => 0,
+    },
+    # Filled in by setConfigConstants(); initialized here so $constants below
+    # aliases this hash instead of autovivifying a detached one.
+    'CONSTANTS' => {},
+    # Each check is gated by the GetOpts flag of the same name.
+    'urlChecks' => {
+        'CHECK_UNIQUE'    => \&checkLinkUnique,
+        'CHECK_DEPTH'     => \&checkLinkDepth,
+        'CHECK_REGEX'     => \&checkLinkRegex,
+        'CHECK_RESPONSES' => \&checkLinkResponse,
+    },
+    'staticLinks' => ['gettingStarted', 'help', 'groups', 'popular', 'tags', 'persons'],
+);
+my ($getOpts, $constants, $mySQLAccess, $mySQLOptions, $mySQLBibsonomy) = ($config{'GetOpts'}, $config{'CONSTANTS'}, $config{'MYSQL'}{'ACCESS'}, $config{'MYSQL'}{'OPTIONS'}, $config{'MYSQL'}{'BIBSONOMY'});
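+
+
+# Each 'BIBSONOMY' entry above is consumed generically by getBibsonomyUrls():
+# the query is paged via sprintf($query, LimitPerQuery, $offset), and every
+# id_key value returned becomes "<--url>/<prepend>/<id>". A minimal sketch of
+# that contract, using a hypothetical 'persons' entry that is NOT part of the
+# shipped configuration:
+#
+#   'persons' => {
+#       'query'   => "SELECT person_id FROM persons LIMIT %d OFFSET %d;",
+#       'id_key'  => 'person_id',   # column selectall_hashref keys the rows by
+#       'prepend' => 'person',      # yields https://bibsonomy.org/person/<person_id>
+#   },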
+
+
+#
+# Workflow
+#
+&startBenchmarking();
+&getCommandFlags();
+&checkUrlArg();
+&checkSitemapRootArg();
+&checkResponsesArg();
+&setConfigConstants();
+&checkSitemapSubmitArg();
+&initCrawling();
+&canCopy();
+&printScriptHeaders();
+&runCrawlerBibsonomy();
+&generateSitemaps();
+&submitSitemaps();
+&finishRun();
+
+
+#
+# Initialization methods
+#
+sub initCrawling {
+    &Delimiter((caller(0))[3], 4);
+    my $runDir = $constants->{'RUN_DIR'};
+
+    # REGEX ARGS
+    if (@{$getOpts->{'LINK_REGEX'}}) {
+        $getOpts->{'CHECK_REGEX'} = 1;
+        for my $i ( 0 .. $#{ $getOpts->{'LINK_REGEX'} } ) {
+            $getOpts->{'LINK_REGEX'}[$i] = qr/$getOpts->{'LINK_REGEX'}[$i]/;
+        }
+    }
+
+    $connection = connectToMySql($mySQLAccess->{'database'});
+    &logAssert("Creating run directory: $runDir", 3);
+    make_path($runDir, { chmod => 0777 });
+    removeDuplicateValues();
+}
+
+
+sub logAssert {
+    my ($message, $verboseLevel) = @_;
+    say $message if $verboseLevel <= $getOpts->{'VERBOSE_LEVEL'};
+}
+
+
+sub getCommandFlags {
+    &Delimiter((caller(0))[3], 4);
+    GetOptions(
+        'url=s'            => \$getOpts->{'URL'},
+        'responses=s'      => \@{$getOpts->{'ALLOWED_RESPONSES'}},
+        'link-limit=i'     => \$getOpts->{'LINK_LIMIT'},
+        'link-depth=i'     => \$getOpts->{'LINK_DEPTH'},
+        'sitemap-limit=i'  => \$getOpts->{'SITEMAP_LIMIT'},
+        'compress!'        => \$getOpts->{'SITEMAP_COMPRESS'},
+        'output-links'     => \$getOpts->{'OUTPUT_LINKS'},
+        'link-regex=s'     => \@{$getOpts->{'LINK_REGEX'}},
+        'sitemap-root=s'   => \$getOpts->{'SITEMAP_ROOT'},
+        'sitemap-submit'   => \$getOpts->{'SITEMAP_SUBMIT'},
+        'output-config'    => \$getOpts->{'OUTPUT_CONFIG'},
+        'output-data'      => \$getOpts->{'OUTPUT_DATA'},
+        'verbose-level=i'  => \$getOpts->{'VERBOSE_LEVEL'},
+        'check-unique!'    => \$getOpts->{'CHECK_UNIQUE'},
+        'check-depth!'     => \$getOpts->{'CHECK_DEPTH'},
+        'check-responses!' => \$getOpts->{'CHECK_RESPONSES'},
+    ) or die "Invalid command line options\n";
+}
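+
+
+# Note on the '!' declarations above: Getopt::Long makes those flags
+# negatable, so the inverse forms --nocompress, --nocheck-unique,
+# --nocheck-depth and --nocheck-responses are accepted as well, e.g.
+# (illustrative):
+#
+#   perl mojo-useragent-bibsonomy.pl --nocompress --nocheck-unique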
+
+
+sub checkUrlArg {
+    &Delimiter((caller(0))[3], 4);
+    return if !$getOpts->{'URL'};
+    die "--url \"$getOpts->{'URL'}\" is not a valid web URI. Make sure it starts with http or https." if !is_web_uri($getOpts->{'URL'});
+}
+
+
+sub checkResponsesArg {
+    &Delimiter((caller(0))[3], 4);
+    return if !@{$getOpts->{'ALLOWED_RESPONSES'}} && !$getOpts->{'CHECK_RESPONSES'};
+    die "You must use --responses together with --check-responses" if !@{$getOpts->{'ALLOWED_RESPONSES'}} || !$getOpts->{'CHECK_RESPONSES'};
+}
+
+
+sub checkSitemapRootArg {
+    &Delimiter((caller(0))[3], 4);
+    return if !$getOpts->{'SITEMAP_ROOT'};
+    die "--sitemap-root \"$getOpts->{'SITEMAP_ROOT'}\" does not exist: $!" if (!-d $getOpts->{'SITEMAP_ROOT'});
+}
+
+
+sub checkSitemapSubmitArg {
+    &Delimiter((caller(0))[3], 4);
+    return if !$getOpts->{'SITEMAP_ROOT'} && !$getOpts->{'SITEMAP_SUBMIT'};
+
+    if( $getOpts->{'SITEMAP_SUBMIT'} && !$getOpts->{'SITEMAP_ROOT'} ) {
+        die "When using --sitemap-submit you also need to set --sitemap-root, the absolute path of the website's root folder.";
+    }elsif( $getOpts->{'SITEMAP_SUBMIT'} && $getOpts->{'SITEMAP_ROOT'} ) {
+        die "--sitemap-root \"$getOpts->{'SITEMAP_ROOT'}\" does not exist." if (!-e $getOpts->{'SITEMAP_ROOT'});
+        die "--sitemap-root \"$getOpts->{'SITEMAP_ROOT'}\" is not a directory." if (!-d $getOpts->{'SITEMAP_ROOT'});
+    }
+}
+
+
+sub removeDuplicateValues {
+    &Delimiter((caller(0))[3], 4);
+    # Iterate over the option names themselves, not over an array reference.
+    foreach ('ALLOWED_RESPONSES', 'LINK_REGEX') {
+        @{$getOpts->{$_}} = uniq(@{$getOpts->{$_}});
+    }
+}
+
+
+sub setConfigConstants {
+    &Delimiter((caller(0))[3], 4);
+
+    my $u = URI->new( $getOpts->{'URL'} );
+    ($data{'URL'}{'SITEMAP_ROOT'} = $u->canonical->scheme . '://' . $u->canonical->host . $u->canonical->path) =~ s/\/$//;
+    $constants->{'URL_HOST'}           = remove_www( URI->new($getOpts->{'URL'})->canonical->host );
+    $constants->{'TIMESTAMP'}          = GetTimestamp('YMDHMS');
+    $constants->{'RUN_DIR'}            = dirname(__FILE__) . '/run/' . $constants->{'TIMESTAMP'};
+    $constants->{'SITEMAP_EXT'}        = $getOpts->{'SITEMAP_COMPRESS'} ? '.xml.gz' : '.xml';
+    $constants->{'SITEMAP_FILE'}       = $constants->{'RUN_DIR'} . '/sitemap' . $constants->{'SITEMAP_EXT'};
+    $constants->{'SITEMAP_INDEX_FILE'} = $constants->{'RUN_DIR'} . '/sitemap-index' . $constants->{'SITEMAP_EXT'};
+    $constants->{'OUTPUT_FILE'}        = $constants->{'RUN_DIR'} . '/links';
+}
+
+
+#
+# BibSonomy Methods
+#
+sub getBibsonomyUrls {
+    &Delimiter((caller(0))[3], 4);
+
+    foreach my $key (keys %{$mySQLBibsonomy}) {
+        my $offset = 0;
+        printActionNrString("Querying $key......");
+        while ( my %results = %{ db_query(sprintf($mySQLBibsonomy->{$key}{'query'}, $mySQLOptions->{'LimitPerQuery'}, $offset), $connection, 'selectall_hashref', $mySQLBibsonomy->{$key}{'id_key'}) } ) {
+            return if is_link_limit();
+            $offset += $mySQLOptions->{'LimitPerQuery'};
+            my $valid = createBibsonomyUrls(\%results, $key);
+            push @{$data{'LINKS'}{200}}, @{$valid};
+            &logAssert('-> ' . scalar @{$data{'LINKS'}{200}}, 2);
+        }
+    }
+}
+
+
+sub createBibsonomyUrls {
+    &Delimiter((caller(0))[3], 4);
+    my ($results_ref, $sqlKey) = @_;
+    my @valid;
+    my $prepend = "$data{'URL'}{'SITEMAP_ROOT'}/$mySQLBibsonomy->{$sqlKey}{'prepend'}";
+
+    foreach my $key ( keys %{$results_ref} ) {
+        my $link_string = "$prepend/$key";
+        $link_string .= "/$results_ref->{$key}{'user_name'}" if $sqlKey eq "publicationsUser";
+        push(@valid, $link_string) if &is_valid_link(URI->new($link_string));
+    }
+
+    return \@valid;
+}
+
+
+sub runCrawlerBibsonomy {
+    &Delimiter((caller(0))[3], 4);
+    printActionNrString("Pushing static links...");
+    @{$data{'LINKS'}{200}} = map { "$data{'URL'}{'SITEMAP_ROOT'}/$_" } @{$config{'staticLinks'}};
+    &logAssert('-> ' . scalar @{$data{'LINKS'}{200}}, 2);
+    getBibsonomyUrls();
+}
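+
+
+# For orientation, the URL families produced above look like this
+# (placeholder values, derived from the 'prepend' settings in %config):
+#
+#   https://bibsonomy.org/publication/<simhash1>              (publicationsPublic)
+#   https://bibsonomy.org/publication/<simhash1>/<user_name>  (publicationsUser)
+#   https://bibsonomy.org/group/<group_name>                  (groups)
+#   https://bibsonomy.org/user/<user_name>                    (users)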
+
+
+#
+# Sitemap Methods
+#
+sub submitSitemaps {
+    &Delimiter((caller(0))[3], 4);
+    return if !$getOpts->{'SITEMAP_ROOT'};
+    my $sitemapUrls = copySitemaps();
+    return if !$getOpts->{'SITEMAP_SUBMIT'};
+    pingSearchEngine($sitemapUrls);
+}
+
+
+sub copySitemaps {
+    &Delimiter((caller(0))[3], 4);
+    my @sitemapUrls;
+
+    foreach my $sitemap ( @{$data{'SITEMAPS'}} ) {
+        my ($file, $dir, $ext) = fileparse($sitemap, qr/\.[^.]*/);
+        my $new_location = $getOpts->{'SITEMAP_ROOT'} . '/' . $file . $ext;
+        my $sitemap_url  = $data{'URL'}{'SITEMAP_ROOT'} . '/' . $file . $ext;
+        push( @sitemapUrls, $sitemap_url );
+        copy($sitemap, $new_location) or warn "Copy failed: $new_location $!";
+    }
+
+    return \@sitemapUrls;
+}
+
+
+sub pingSearchEngine {
+    &Delimiter((caller(0))[3], 4);
+    my $sitemapUrls_ref = shift;
+    my $uri = URI::Encode->new( { encode_reserved => 0 } );
+
+    for my $url (@{$sitemapUrls_ref}) {
+        for my $engine ( keys %{$config{'PING'}} ) {
+            my $ping_url = $config{'PING'}{$engine}{'URL'} . $uri->encode($url);
+            &logAssert("\tSubmitting to $engine: $url -> $ping_url", 3);
+            my $code = &getStatusCode($ping_url);
+            &logAssert("\t$code", 2);
+        }
+    }
+}
+
+
+sub printActionNrString {
+    &Delimiter((caller(0))[3], 4);
+    my $message = shift;
+    return if $getOpts->{'VERBOSE_LEVEL'} < 1;
+    say "$actionNr) $message";
+    $actionNr++;
+}
+
+
+sub createSitemap {
+    &Delimiter((caller(0))[3], 4);
+    my ($index, $sitemap) = @_;
+    my ($file, $dir, $ext) = fileparse($constants->{'SITEMAP_FILE'}, qr/\.[^.]*/);
+    my $output_file = sprintf("%ssitemap_%d%s", $dir, $index, $constants->{'SITEMAP_EXT'});
+    printActionNrString("Writing $output_file...");
+    $sitemap->write($output_file);
+    push( @{$data{'SITEMAPS'}}, $output_file );
+}
+
+
+sub addToSitemapIndex {
+    &Delimiter((caller(0))[3], 4);
+    my ($index, $indexSitemap) = @_;
+    my $sitemap_file = sprintf("%s/sitemap_%d%s", $data{'URL'}{'SITEMAP_ROOT'}, $index, $constants->{'SITEMAP_EXT'});
+    $indexSitemap->add(Search::Sitemap::URL->new(loc => $sitemap_file));
+}
+
+
+sub generateSitemaps {
+    &Delimiter((caller(0))[3], 4);
+    my $indexSitemap = Search::Sitemap::Index->new('pretty' => 1);
+    my $sitemap = Search::Sitemap->new(pretty => 1);
+    my ($i, $index) = (0, 0);
+
+    foreach my $url ( @{$data{'LINKS'}{'200'}} ) {
+        $sitemap->add(Search::Sitemap::URL->new(loc => $url));
+        $i++;
+        if( $i == $getOpts->{'SITEMAP_LIMIT'} ) {
+            $index++;
+            $i = 0;
+            createSitemap($index, $sitemap);
+            addToSitemapIndex($index, $indexSitemap);
+            $sitemap = Search::Sitemap->new(pretty => 1);
+        }
+    }
+
+    writeSitemapFiles($sitemap, $indexSitemap, $index, $i);
+}
+
+
+sub writeSitemapFiles {
+    &Delimiter((caller(0))[3], 4);
+    my ($sitemap, $indexSitemap, $index, $i) = @_;
+
+    if( $i < $getOpts->{'SITEMAP_LIMIT'} && $index == 0 ) {
+        $sitemap->write( $constants->{'SITEMAP_FILE'} );
+        push( @{$data{'SITEMAPS'}}, $constants->{'SITEMAP_FILE'} );
+    }elsif( $i < $getOpts->{'SITEMAP_LIMIT'} && $index > 0 && $i > 0 ) {
+        $index++;
+        createSitemap($index, $sitemap);
+        addToSitemapIndex($index, $indexSitemap);
+        createSitemapIndex($indexSitemap);
+    }elsif( $i < $getOpts->{'SITEMAP_LIMIT'} && $index > 0 && $i == 0 ) {
+        createSitemapIndex($indexSitemap);
+    }
+}
+
+
+sub createSitemapIndex {
+    &Delimiter((caller(0))[3], 4);
+    my $indexSitemap = shift;
+    printActionNrString("Writing $constants->{'SITEMAP_INDEX_FILE'}...");
+    $indexSitemap->write( $constants->{'SITEMAP_INDEX_FILE'} );
+    push( @{$data{'SITEMAPS'}}, $constants->{'SITEMAP_INDEX_FILE'} );
+}
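+
+
+# Worked example of the split logic above, assuming the default
+# --sitemap-limit of 50000: 120000 links produce sitemap_1 and sitemap_2
+# inside the generateSitemaps() loop, the remaining 20000 links become
+# sitemap_3 in writeSitemapFiles(), and sitemap-index references all three.
+# With fewer than 50000 links a single sitemap file is written and no index
+# is generated.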
+
+
+#
+# URL validate Methods
+#
+sub is_valid_link {
+    &Delimiter((caller(0))[3], 4);
+    my ($link) = @_;
+    # Avoid the buggy "my $x = ... if COND" idiom; initialize, then assign.
+    my $link_depth = 0;
+    $link_depth = () = $link->path =~ /\/.+?/g if $getOpts->{'LINK_DEPTH'};
+    my $link_host = &remove_www($link->host);
+    &logAssert("-> $link", 3);
+    my $code;
+    $code = getStatusCode($link->canonical->as_string) if $getOpts->{'CHECK_RESPONSES'};
+
+    for my $key (keys %{$config{'urlChecks'}}) {
+        next if !$getOpts->{$key};
+        return 0 if !$config{'urlChecks'}{$key}->($link->canonical->as_string, $link->scheme, $link_depth, $link_host, $code);
+    }
+
+    return 1;
+}
+
+
+sub checkLinkUnique {
+    &Delimiter((caller(0))[3], 4);
+    my ($link_string, $link_protocol, $link_depth, $link_host) = @_;
+
+    if ( ++$uniq{$link_string} > 1 ) {
+        &logAssert("\tFAIL (UNIQ)", 3);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+sub checkLinkDepth {
+    &Delimiter((caller(0))[3], 4);
+    my ($link_string, $link_protocol, $link_depth, $link_host) = @_;
+
+    if ($link_depth > $getOpts->{'LINK_DEPTH'}) {
+        &logAssert("\tFAIL (DEPTH $link_depth)", 3);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+sub checkLinkRegex {
+    &Delimiter((caller(0))[3], 4);
+    my ($link_string, $link_protocol, $link_depth, $link_host) = @_;
+
+    if( !&check_multiple_regex($link_string) ) {
+        &logAssert("\tFAIL (REGEX)", 3);
+        return 0;
+    }
+
+    return 1;
+}
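+
+
+# Example: with --link-regex '^https://bibsonomy\.org/publication/' only
+# publication URLs pass the check above; repeating --link-regex adds
+# alternatives that are OR-ed together by check_multiple_regex() below.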
+
+
+sub getStatusCode {
+    &Delimiter((caller(0))[3], 4);
+    my ($link_string) = @_;
+    my $uaLWP = LWP::UserAgent->new;
+    my $response = $uaLWP->get($link_string);
+    return $response->code;
+}
+
+
+sub checkLinkResponse {
+    &Delimiter((caller(0))[3], 4);
+    my ($link_string, $link_protocol, $link_depth, $link_host, $code) = @_;
+
+    if (!isInArray($code, $getOpts->{'ALLOWED_RESPONSES'})) {
+        &logAssert("\tFAIL (Status-Code not allowed: $code)", 3);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+#
+# Finish Methods
+#
+sub calcMinMaxAvg {
+    return sprintf("URL lengths: min=%d, max=%d, avg=%d", get_min_max_avg());
+}
+
+
+sub calcRuntime {
+    return 'Runtime: ' . &get_runtime($data{'STATISTICS'}{'TIME'}{'START'}, time());
+}
+
+
+sub endBenchmark {
+    my $t1 = Benchmark->new;
+    my $td = timediff($t1, $data{'STATISTICS'}{'BENCHMARK_START'});
+    return 'Benchmark: ' . timestr($td);
+}
+
+
+sub runCompleted {
+    my $timestamp = &GetTimestamp('YMDHMS_r');
+    return "RUN $constants->{'TIMESTAMP'} completed @ $timestamp";
+}
+
+
+sub finishRun {
+    &Delimiter((caller(0))[3], 4);
+    $connection->disconnect();
+    outputLinks() if $getOpts->{'OUTPUT_LINKS'};
+
+    # Guard the summary once here instead of in every helper, so say() never
+    # receives an empty list when --verbose-level is 0.
+    if ($getOpts->{'VERBOSE_LEVEL'} >= 1) {
+        say runCompleted();
+        &logAssert(('-' x 80), 1);
+        say calcMinMaxAvg();
+        say calcRuntime();
+        say endBenchmark();
+    }
+
+    printToFile("$constants->{'RUN_DIR'}/config", \%config) if $getOpts->{'OUTPUT_CONFIG'};
+    printToFile("$constants->{'RUN_DIR'}/data", \%data) if $getOpts->{'OUTPUT_DATA'};
+}
+
+
+sub get_min_max_avg {
+    &Delimiter((caller(0))[3], 4);
+    return (0, 0, 0) if !@{$data{'LINKS'}{200}};
+    my $sum = 0;
+    my @lengths;
+
+    for my $key ( keys %{$data{'LINKS'}} ) {
+        push @lengths, map { length } @{$data{'LINKS'}{$key}};
+    }
+    $sum += $_ for @lengths;
+
+    return (min(@lengths), max(@lengths), int($sum / @lengths));
+}
+
+
+sub get_checked_links {
+    &Delimiter((caller(0))[3], 4);
+    my $size = 0;
+
+    for my $key ( keys %{$data{'LINKS'}} ) {
+        $size += scalar @{$data{'LINKS'}{$key}};
+    }
+
+    return $size;
+}
+
+#
+# Helper methods
+#
+sub isInArray {
+    &Delimiter((caller(0))[3], 4);
+    my ($element, $arr_ref) = @_;
+    my %hash = map { $_ => 1 } @{$arr_ref};
+    return exists($hash{$element});
+}
+
+
+sub connectToMySql {
+    &Delimiter((caller(0))[3], 4);
+    my ($db) = @_;
+    # Assemble the DSN from the MySQL config values.
+    my $connectionInfo = "dbi:mysql:database=$db;host=$mySQLAccess->{'hostname'};port=$mySQLAccess->{'port'}";
+    my $l_connection = DBIx::Connector->new(
+        $connectionInfo,
+        $mySQLAccess->{'user'},
+        $mySQLAccess->{'pw'},
+        {
+            RaiseError           => $mySQLOptions->{'RaiseError'},   # SUGGESTED BY AnyEvent::DBI::MySQL
+            AutoCommit           => $mySQLOptions->{'AutoCommit'},
+            mysql_auto_reconnect => $mySQLOptions->{'Reconnect'},
+        }
+    );
+
+    return $l_connection->dbh;
+}
+
+
+sub startBenchmarking {
+    &Delimiter((caller(0))[3], 4);
+    $data{'STATISTICS'}{'BENCHMARK_START'} = Benchmark->new;
+    $data{'STATISTICS'}{'TIME'}{'START'} = time();
+    $data{'STATISTICS'}{'CONTENT_LENGTH'} = 0;
+}
+
+
+sub printScriptHeaders {
+    &Delimiter((caller(0))[3], 4);
+    return if $getOpts->{'VERBOSE_LEVEL'} < 1;
+    my $timestamp_r = GetTimestamp('YMDHMS_r');
+
+    say '=' x 80;
+    say "$license";
+    say '-' x 80;
+    say 'Sitemap Generator v1.0.0 by Marlon Maschkiwitz';
+    say 'E-Mail: zino@zinomedia.de';
+    say '=' x 80;
+    say 'Start checking @ ', $timestamp_r . "\n";
+}
+
+
+sub printToFile {
+    &Delimiter((caller(0))[3], 4);
+    my ($filepath, $ref) = @_;
+
+    # DATA DUMPER UTF8 HACK
+    no warnings 'redefine';
+    local *Data::Dumper::qquote = sub { qq["${\(shift)}"] };
+    local $Data::Dumper::Useperl = 1;
+
+    open my $FILE, '>:encoding(UTF-8)', $filepath or die "Cannot open $filepath: $!";
+    print $FILE Dumper $ref;
+    close $FILE;
+}
+
+
+sub canCopy {
+    &Delimiter((caller(0))[3], 4);
+    return if !$getOpts->{'SITEMAP_SUBMIT'} && !$getOpts->{'SITEMAP_ROOT'};
+    my $tmpfile = &tempfile();
+    my $new_location = "$getOpts->{'SITEMAP_ROOT'}/tmp";
+
+    copy $tmpfile, $new_location or do {
+        unlink $tmpfile;
+        die "Cannot copy to $new_location: $!";
+    };
+
+    unlink($tmpfile, $new_location);
+}
+
+
+sub db_query {
+    &Delimiter((caller(0))[3], 4);
+    # Only the 'selectall_hashref' switch is implemented; @rest carries the
+    # name of the key column.
+    my ($query, $connection, $switch, @rest) = @_;
+    my $response = $connection->selectall_hashref($query, $rest[0]) or die $connection->errstr;
+    return $response;
+}
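+
+
+# A db_query() call and its return shape (illustrative values):
+#
+#   my $rows = db_query('SELECT user_name FROM user LIMIT 2 OFFSET 0',
+#                       $connection, 'selectall_hashref', 'user_name');
+#   # $rows = { 'alice' => { user_name => 'alice' },
+#   #           'bob'   => { user_name => 'bob' } };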
+
+
+sub trim {
+    &Delimiter((caller(0))[3], 4);
+    my $s = shift;
+    # /r returns the trimmed copy instead of the substitution count.
+    return $s =~ s/^\s+|\s+$//gr;
+}
+
+
+sub is_link_limit {
+    &Delimiter((caller(0))[3], 4);
+    return if !$getOpts->{'LINK_LIMIT'};
+    if( exists $data{'LINKS'}{200} && scalar @{$data{'LINKS'}{200}} >= $getOpts->{'LINK_LIMIT'} ) {
+        &logAssert('LINK_LIMIT of ' . $getOpts->{'LINK_LIMIT'} . ' reached', 3);
+        $reached_link_limit = 1;
+        return 1;
+    }else {
+        return 0;
+    }
+}
+
+
+sub remove_www {
+    &Delimiter((caller(0))[3], 4);
+    my $str = shift;
+    # /r returns the modified copy instead of the substitution count.
+    return $str =~ s/^www\.//ir;
+}
+
+
+sub check_multiple_regex {
+    &Delimiter((caller(0))[3], 4);
+    my $str = shift;
+
+    # Plain loop instead of the deprecated given/when smartmatch.
+    for my $regex (@{$getOpts->{'LINK_REGEX'}}) {
+        return 1 if $str =~ $regex;
+    }
+
+    return 0;
+}
+
+
+sub GetTimestamp {
+    &Delimiter((caller(0))[3], 4);
+    my $format = shift;
+    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
+    return sprintf( "%04d%02d%02d_%02d%02d%02d", $year+1900, $mon+1, $mday, $hour, $min, $sec ) if $format eq 'YMDHMS';
+    return sprintf( "%04d-%02d-%02d %02d:%02d:%02d", 1900 + $year, $mon + 1, $mday, $hour, $min, $sec ) if $format eq 'YMDHMS_r';
+}
+
+
+sub Delimiter {
+    my ($subName, $verboseLevel) = @_;
+    return if $verboseLevel > $getOpts->{'VERBOSE_LEVEL'};
+    say "\n" . '-' x 80;
+    print '> ' . $subName;
+    say "\n" . '-' x 80;
+}
+
+
+sub get_runtime {
+    &Delimiter((caller(0))[3], 4);
+    my ($t1, $t2) = @_;
+    return strftime( "%T", gmtime( $t2 - $t1 ) );
+}
+
+
+sub tempfile {
+    &Delimiter((caller(0))[3], 4);
+    return if !-e $constants->{'RUN_DIR'};
+    my $filename = "$constants->{'RUN_DIR'}/tmp";
+    open(my $fh, '>', $filename) or warn "Could not create tmp file '$filename' $!";
+    close($fh);
+    return $filename;
+}
+
+
+sub outputLinks {
+    return if !$getOpts->{'OUTPUT_LINKS'};
+    for my $key ( keys %{$data{'LINKS'}} ) {
+        # Double quotes so $key interpolates; pass message and level as two args.
+        &logAssert("Total links with status code $key found: " . scalar @{$data{'LINKS'}{$key}}, 1);
+        printToFilePlain("$constants->{'OUTPUT_FILE'}_$key", $data{'LINKS'}{$key});
+    }
+}
+
+
+sub printToFilePlain {
+    &Delimiter((caller(0))[3], 4);
+    my ($filepath, $ref) = @_;
+    open my $FILE, '>:encoding(UTF-8)', $filepath or die "Cannot open $filepath: $!";
+    print $FILE join("\n", @{$ref});
+    close $FILE;
+}
+
+
+__END__
+
+=pod
+
+=encoding UTF-8
+
+=head1 NAME
+
+BibSonomy Dynamic Sitemap Generator
+
+=head1 IMPORTANT USAGE GUIDELINES
+
+To run the script with the default options, run the script without arguments.
+Some routines may require root level privileges.
+
+=head1 SYNOPSIS
+
+    usage: script [--url <url>] [--link-limit <number>] [--link-depth <number>]
+        [--sitemap-limit <number>] [--link-regex <regex>] [--sitemap-root <path>]
+        [--verbose-level <number>] [--responses <code>] [--nocompress]
+        [--output-links] [--nocheck-unique] [--check-depth] [--check-responses]
+        [--sitemap-submit] [--output-config] [--output-data]
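+
+Illustrative invocations (paths and limits are examples, not defaults):
+
+    # Generate sitemaps only, with chatty output and plain link files
+    perl mojo-useragent-bibsonomy.pl --verbose-level 2 --output-links
+
+    # Copy the sitemaps to the web root and submit them to search engines
+    perl mojo-useragent-bibsonomy.pl --sitemap-root /var/www/bibsonomy \
+        --sitemap-submit --link-limit 500000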
+
+=head1 OPTION ARGUMENTS
+
+--url | Base URL including the protocol that is prepended to every generated URL. Default: https://bibsonomy.org
+
+--link-limit | Maximum number of generated URLs. Note that the actual count can exceed the limit by up to ~10,000 URLs due to the database query offset.
+
+--link-depth | Maximum depth of generated URLs. Only works alongside --check-depth. Default: 3
+
+--sitemap-limit | Number of URLs at which the sitemap is split into multiple sitemaps. Once a split occurs, a sitemap index covering all individual sitemaps is generated as well. Default: 50000
+
+--link-regex | If provided, links must match the regex to count as valid. Use multiple times to provide more than one. Default: disabled
+
+--sitemap-root | Location of the website's root directory, to which the sitemaps will be moved. Required when using --sitemap-submit. Default: disabled
+
+--verbose-level | Verbosity level of the STDOUT messages. Default: 0
+
+--responses | Add response codes that count as a valid link. Use multiple times to provide more than one. Must be used together with --check-responses.
+
+=head1 FLAG ARGUMENTS
+
+--nocompress | Sitemaps will not be compressed as .gz files. Default: compressed
+
+--output-links | Write plain files with the generated URLs. Each status code produces a separate file with the corresponding links. Default: disabled
+
+--nocheck-unique | Allows duplicate URLs in the sitemaps.
+
+--check-depth | Check the depth of the URLs. The maximum depth can be overridden via --link-depth.
+
+--check-responses | Check the status code of each generated URL. Permitted status codes are set via --responses.
+
+--sitemap-submit | Submit the sitemap files to search engines. The sitemap files must be moved to the website root directory, so this only works alongside --sitemap-root.
+
+--output-config | Dump the %config hash to the file "config" in the current run directory.
+
+--output-data | Dump the %data hash to the file "data" in the current run directory.
+
+=head1 DESCRIPTION
+
+The Sitemap Protocol allows you to inform search engine crawlers about URLs on your websites that are available for crawling.
+A sitemap consists of a list of URLs and may also contain additional information about those URLs, such as when they were last modified, how frequently they change, etc.
+This script allows you to dynamically generate URLs from the BibSonomy database and to create, modify and submit sitemaps to multiple search engines.
+
+=head1 REQUIRED MODULES
+
+L<Data::Validate::URI>
+L<File::Path>
+L<List::Util>
+L<File::Copy>
+L<Benchmark>
+L<Data::Dumper>
+L<URI>
+L<Getopt::Long>
+L<File::Basename>
+L<LWP::UserAgent>
+L<DBIx::Connector>
+L<URI::Encode>
+L<Search::Sitemap>
+
+=head1 AUTHOR
+
+Marlon Maschkiwitz - zino@zinomedia.de
+
+=head1 COPYRIGHT AND LICENSE
+
+MIT License
+
+Copyright (c) 2021 Marlon Maschkiwitz
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+=cut