This script reads a file of urls to do multithreading HTTP requests.
How can I use an array with urls to make multithreading requests?
My array will have something like:
#array = ("https://example.com/xsd","https://example.com/xys","https://example.com/des","https://example.com/hduei");
I need to remove the function of reading file with urls, but I can not.
#!/usr/bin/perl
use strict;
use warnings;
use threads;
use threads::shared;
use Fcntl qw( LOCK_EX );
use IO::Handle qw( );
use LWP::UserAgent qw( );
use Thread::Queue 3.01 qw( );
use constant NUM_WORKERS => 20;
my $output_lock :shared;
my $output_fh;
sub write_to_output_file {
lock($output_lock);
print($output_fh #_);
$output_fh->flush();
}
sub worker {
my ($ua, $url) = #_;
my $response = $ua->get($url);
write_to_output_file("$url\n")
if $response->success
&& $response->content =~ /Exist/;
}
{
$output_fh = \*STDOUT; # Or open a file.
my $q = Thread::Queue->new();
for (1..NUM_WORKERS) {
async {
my $ua = LWP::UserAgent->new( timeout => 15 );
while (my $job = $q->dequeue()) {
worker($ua, $job);
}
};
}
while (<>) {
chomp;
$q->enqueue($_);
}
$q->end();
$_->join() for threads->list();
}
This part of the code you've shown reads filenames from the command line arguments, and then reads all the lines in these files. It then iterates over the lines.
while (<>) { # <--- here
chomp;
$q->enqueue($_);
}
You can replace that with an array, which of course needs a for loop. Make sure to remove the chomp, as there won't be a need to remove newlines.
foreach (#urls) {
$q->enqueue($_);
}
Related
I have an issue with a Perl script using thread.
It works fine when I launch it manually but when I launch it using crontab I have this feedback:
Perl exited with active threads:
0 running and unjoined
1 finished and unjoined
0 running and detached
The PATH variable and SHELL variable are correct on the crontad.
I try to make a init script (to launch as a service) and same error:
Feb 24 08:04:48 SERVER kernel: perl[103293]: segfault at 4a8 ip
00007f6cfd075dd9 sp 00007fffb93437c0 error 4 in
libperl.so[7f6cfcfdf000+183000] Feb 24 08:04:49 SERVER
test_ping[102238]: Perl exited with active threads: Feb 24 08:04:49
SERVER test_ping[102238]: 0 running and unjoined Feb 24 08:04:49
SERVER test_ping[102238]: 1 finished and unjoined Feb 24 08:04:49
SERVER test_ping[102238]: 0 running and detached
So I have also tried to modified the perl with:
for my $thread (threads->list) {
$thread->join();
}
Instead of
for my $thread (threads->list) {
$thread->detach();
}
And after this modification when I launch manually the script, this one seems to be stuck/freezing.
So to resume this is all my check:
Executed manually it work
Via crontab it doesn't work, check of PATH variable and SHELL
variable are ok
Via init script, doesn't work
Try to modify the perl script to join all thread but it the script
is freezing after that.
Anyone has a suggestion ? Something else to check/do ?
Thk
use lib '/usr/local/perf/lib';
use lib '/usr/share/perl5';
use threads;
use Thread::Queue;
use SNMP::Persist qw(&define_oid &start_persister &define_subtree);
use Schedule::ByClock;
use Time::HiRes qw( usleep ualarm gettimeofday tv_interval );
use strict;
#use warnings;
use constant DEBUG => 0;
use constant DEBUG2 => 1;
if ($#ARGV + 1 != 2) {
print "usage: test_ping.pl OUTPUTFILE INPUTFILE \n";
exit;
}
my $output_file=$ARGV[0];
my $data_file=$ARGV[1];
shift #ARGV;
shift #ARGV;
#start the thread serving answers
start_persister();
#create queue for processing commands
my $q_queue = new Thread::Queue;
my $r_queue = new Thread::Queue;
#create threads for processing queues
for(my $i= 0; $i < $thread_num; $i++) {
threads->create(\&process) -> detach();
}
my $datestring=localtime();
my %subtree;
my #raw_data;
my ($q_line, #q_split);
my ($r_line, #r_split);
my $index=0;
# open file to get data
open(DAT, $data_file) || die("Could not open file!");
#raw_data=<DAT>;
close(DAT);
# enqueue all lines to be process by threads
foreach $q_line (#raw_data) {
chomp($q_line);
$q_line =~ s/^\s+//;
$q_line =~ s/\s+$//;
next if ($q_line =~ /^#.*/);
next if ($q_line eq "");
next if ($q_line =~ /^\|/);
#q_split=split(/\|/,$q_line);
next if (!($q_split[0] eq "icmp" || $q_split[0] eq "tcp" || $q_split[0] eq "ldap" || $q_split[0] eq "dig" ));
$q_queue->enqueue(++$index ."|". $q_line);
}
while ($index != 0 && ($r_line = $r_queue->dequeue)) {
open(my $fh, '>>', $output_file) or die "Could not open file '$output_file' $!";
print $fh $datestring."|";
print $fh $r_line."\n";
close $fh;
#r_split=split(/\|/,$r_line);
$index--;
}
for my $thread (threads->list) {
$thread->detach();
}
Below the process fonction:
sub process {
# my #hotefqdn = split(/\./, `hostname`);
# my $hote=$hotefqdn[0];
my ($q_line,#q_split,$q_index,$q_query);
my ($q_module,$q_type,$q_name,$q_host,$q_port,$q_ssl,$q_send,$q_expect,$q_quit);
my ($q_lookup,$q_record);
my ($q_base_dn,$q_attr,$q_binddn,$q_password,$q_warn_time,$q_crit_time,$q_timeout);
my ($r_tab);
while ($q_line = $q_queue->dequeue) {
#q_split=split(/\|/,$q_line);
$q_index=$q_split[0];
$q_module=$q_split[1];
if ($q_module eq "icmp") {
$q_type=$q_split[2];
$q_name=$q_split[3];
$q_host=$q_split[4];
$q_query="$q_host (ping)";
print "query=$q_query\n" if(DEBUG);
$r_tab=icmp_query($q_host);
}
elsif ($q_module eq "tcp") {
$q_type=$q_split[2];
$q_name=$q_split[3];
$q_query="$q_host ($q_type:$q_port)";
print "query=$q_query\n" if(DEBUG);
$r_tab=tcp_query($q_host,$q_port,$q_ssl,$q_send,$q_expect,$q_quit);
}
elsif ($q_module eq "ldap") {
$q_type=$q_split[2];
$q_name=$q_split[3];
print "query=$q_query\n" if(DEBUG);
$r_tab=ldap_query($q_host,$q_base_dn,$q_port,$q_attr,$q_binddn,$q_password,$q_warn_time,$q_crit_time,$q_timeout);
}
elsif ($q_module eq "dig") {
$q_type=$q_split[2];
$q_name=$q_split[3];
$q_query="$q_lookup($q_record) #".$q_host;
print "query=$q_query\n" if(DEBUG);
$r_tab=dig_query($q_host,$q_port,$q_lookup,$q_record,$q_expect);
}
$r_queue->enqueue($q_index."|".$q_name."|".$q_type."|".$q_query."|".$r_tab->{'min'}."|".$r_tab->{'med'}."|".$r_tab->{'avg'}."|".$r_tab->{'max'}."|".$r_tab->{'dev'}."|".$r_tab->{'loss'}."|".$r_tab->{'err'});
}
}
First of all, don't detach your threads. When you do, you can't wait for them to finish.
for (my $i= 0; $i < $thread_num; $i++) {
threads->create(\&process) -> detach();
}
...
for my $thread (threads->list) {
$thread->detach();
}
should be
for (1..$thread_num) {
threads->create(\&process);
}
...
... Tell the threads to finish up ...
for my $thread (threads->list) {
$thread->join();
}
Now on the question: Why don't your threads finish? Well, you never tell them to exit, so they never do! You need to ask to them to exit, and that can be achieved by adding the following:
$q_queue->end();
What follows is what you get after you apply the above fixes. I've also moved all thread-related code out of process, since it doesn't belong there. Finally, I removed the reliance on $index by moving the output code into its own thread.
sub process {
my ($q_line) = #_;
...
return join("|", $q_index, $q_name, $q_type, $q_query, #$r_tab{qw( min med avg max dev loss err )});
}
my $request_q = Thread::Queue->new();
my $response_q = Thread::Queue->new();
my #worker_threads;
for (1..$thread_num) {
push #worker_threads, async {
while (defined( my $request = $request_q->dequeue() )) {
$response_q->enqueue( process($request) );
}
};
}
my $output_thread = do {
my $datestring = localtime();
open(my $fh, '>', $output_file)
or die("Can't create file \"$output_file\": $!\n");
async {
while (defined( my $response = $response_q->dequeue() )) {
print($fh "$datestring|$response\n");
}
}
};
{
my %protos = map { $_ => 1 } qw( icmp tcp ldap dig );
open(my $fh, '<', $data_file)
or die("Can't open file \"$data_file\": $!\n");
my $index = 0;
while (<$fh>) {
s/^\s+//;
s/\s+\z//;
next if $_ eq "" || /^#/;
my ($proto) = split /\|/;
next if !$protos{$proto};
$request_q->enqueue(++$index ."|". $_);
}
}
$request_q->end();
$_->join() for #worker_threads;
$response_q->end();
$output_threads->join();
I have a multi threaded Perl crawler which is working fine if I declare URLs in array.How ever if I read the URLs from DB I am getting "segmentation failure" error.Please help me to fix this issue.Thanks
Direct URL declaration
use 5.012; use warnings;
use threads;
use Thread::Queue;
use LWP::UserAgent;
use constant THREADS => 10;
my $queue = Thread::Queue->new();
my #URLs =qw(http://www.example.com
http://www.example.com1
http://www.example.com2 );
print #URLs;
my #threads;
for (1..THREADS) {
push #threads, threads->create(sub {
my $ua = LWP::UserAgent->new;
$ua->timeout(5); # short timeout for easy testing.
while(my $task = $queue->dequeue) {
my $response = eval{ $ua->get($task)->status_line };
say "$task --> $response";
}
});
}
$queue->enqueue(#URLs);
$queue->enqueue(undef) for 1..THREADS;
# ... here work is done
$_->join foreach #threads;
Trying to read the URLs from DB
my $dbh = DBI->connect("DBI:mysql:$database;host=$server", $username, $password) # Get the rows from database
|| die "Could not connect to database: $DBI::errstr";
my $sth = $dbh->prepare('select cname,url,xpath,region from competitors') #query to select required fields
|| die "$DBI::errstr";
$sth->execute();
if ($sth->rows < 0) {
print "Sorry, no domains found.\n";
}
else {
while (my $results = $sth->fetchrow_hashref) {
my $competitor= $results->{cname};
my $url = $results->{url};
my $xpath = $results->{xpath};
my $region = $results->{region};
push(my #all,$url);
use constant THREADS => 10;
my $queue = Thread::Queue->new();
my #URLs=#all;
my #threads;
for (1..THREADS) {
push #threads, threads->create(sub {
my $ua = LWP::UserAgent->new;
$ua->timeout(500); # short timeout for easy testing.
while(my $task = $queue->dequeue) {
my $response = eval{ $ua->get($task)->status_line };
print "$task --> $response";
}
});
}
$queue->enqueue( #URLs);
$queue->enqueue(undef) for 1..THREADS;
# ... here work is done
$_->join foreach #threads;
}
} #close db
$sth->finish;
$dbh->disconnect;
Expected o/p
www.example.com-->200 ok
www.example.com1-->200 ok
Current o/p
Segmentation error
Your $sth and $dbh are still around when you create the thread, creating a copy of them, which is a no-no.
newly created threads must make their own connections to the database. Handles can't be shared across threads.
Better scoping of your variables should avoid the problem.
use strict;
use warnings;
use threads;
use Thread::Queue 3.01 qw( );
use constant NUM_WORKERS => 10;
sub worker {
my ($ua, $url) = #_;
...
}
{
my $q = Thread::Queue->new();
for (1..NUM_WORKERS) {
async {
my $ua = LWP::UserAgent->new();
while ( my $url = $q->dequeue() ) {
eval { worker($ua, $url); 1 }
or warn $#;
}
};
}
{
my $dbh = DBI->connect(..., { RaiseError => 1 });
my $sth = $dbh->prepare('SELECT ...');
$sth->execute();
while ( my $row = $sth->fetchrow_hashref() ) {
$q->enqueue($row->{url});
}
}
$q->end();
$_->join for threads->list;
}
You should declare #all outside of the while loop, then, when the URLs are pushed, close that loop and go on
my $dbh = DBI->connect("DBI:mysql:$database;host=$server", $username, $password) # Get the rows from database
|| die "Could not connect to database: $DBI::errstr";
my $sth = $dbh->prepare('select cname,url,xpath,region from competitors') #query to select required fields
|| die "$DBI::errstr";
$sth->execute();
# >> declare your URL-array before starting to fetch
my #URLs;
if ($sth->rows < 0) {
print "Sorry, no domains found.\n";
}
else {
while (my $results = $sth->fetchrow_hashref) {
my $competitor= $results->{cname};
my $url = $results->{url};
my $xpath = $results->{xpath};
my $region = $results->{region};
push(#URLs,$url);
}
}
$sth->finish;
$dbh->disconnect;
use constant THREADS => 10;
my $queue = Thread::Queue->new();
my #threads;
for (1..THREADS) {
push #threads, threads->create(sub {
my $ua = LWP::UserAgent->new;
$ua->timeout(500); # short timeout for easy testing.
while(my $task = $queue->dequeue) {
my $response = eval{ $ua->get($task)->status_line };
print "$task --> $response";
}
});
}
$queue->enqueue( #URLs);
$queue->enqueue(undef) for 1..THREADS;
# ... here work is done
$_->join foreach #threads;
Segfaults are very rare as a result of perl code. They're memory related, and usually mean a problem in external binaries. (I'd be betting on DBI here)
Threads in particular have a lot of legacy issues - they're getting better in newer versions of perl though. I would strongly recommend that you consider upgrading to a recent version of perl if you haven't already. I know that isn't always an option, but it's a good idea.
It's really hard to second guess your problem, as I don't have your DB, so I can't recreate it.
I would suggest that generally there's a few things you can do to keep threads 'clean' - the way your code works, is the DB handles are in scope within the threads. I would avoid doing that. Declare the thread sub right at the top, with as narrow a scope as possible.
I will note though:
push ( my #all, $url );
probably doesn't do what you think it does!
But yes, taking your code I'd put it like this:
#!/usr/bin/perl
use strict;
use warnings;
use threads;
use Thread::Queue;
use LWP;
my $num_threads = 10;
my $work_q = Thread::Queue->new();
sub worker {
my $ua = LWP::UserAgent->new;
$ua->timeout(500); # short timeout for easy testing.
while ( my $task = $work_q->dequeue ) {
my $response = eval { $ua->get($task)->status_line };
print "$task --> $response";
}
}
## fetch_list
sub fetch_url_list {
my $dbh = DBI->connect( "DBI:mysql:$database;host=$server",
$username, $password ) # Get the rows from database
|| die "Could not connect to database: $DBI::errstr";
my $sth =
$dbh->prepare( 'select cname,url,xpath,region from competitors'
) #query to select required fields
|| die "$DBI::errstr";
$sth->execute();
if ( $sth->rows < 0 ) {
print "Sorry, no domains found.\n";
}
else {
while ( my $results = $sth->fetchrow_hashref ) {
my $competitor = $results->{cname};
my $url = $results->{url};
my $xpath = $results->{xpath};
my $region = $results->{region};
$work_q -> enqueue ( $url );
}
}
$sth->finish;
$dbh->disconnect;
}
for ( 1 .. $num_threads ) {
threads->create( \&worker );
}
fetch_url_list();
$work_q->end;
foreach my $thr ( threads->list() ) {
$thr->join();
}
This way - none of your threads have the DB stuff 'in scope', and the DB doesn't have thread stuff in scope. That reduces odds of 'pollution' causing you problems. In particular - threads when they start 'copy' everything in the current scope, which can do really wierd things when they're objects. (e.g. such as the DB handles)
Failing that, I would consider looking at a 'forking' approach. Threads are good at passing data back and forth, but forks are generally more efficient (definitely on Unix based systems) when you don't need to pass data back and forth (and you don't, you're just running a test and printing a result).
Below is a simplified example of my problem. Here exec should give an error because xecho doesn't exist.
Question
Is there a way to have Capture::Tiny capture the output from Parallel::ForkManager?
#!/usr/bin/perl
use strict;
use warnings;
use Parallel::ForkManager;
use Capture::Tiny 'capture';
my ($stdout, $stderr, $exit) = capture {
my $pm = Parallel::ForkManager->new(5);
my $pid = $pm->start;
if (!$pid) {
no warnings; # no warnings "exec" is not working
exec("xecho test");
$pm->finish;
}
};
print "$stdout\n";
print "$exit\n";
print "$stderr\n";
You cannot use Capture::Tiny to capture output from a child process, but you could use the run_on_finish method from Parallel::ForkManager :
use strict;
use warnings;
use Capture::Tiny qw(capture);
use Data::Dump;
use Parallel::ForkManager;
my $pm = Parallel::ForkManager->new(5);
$pm -> run_on_finish (
sub {
my (
$pid, $exit_code, $ident, $exit_signal,
$core_dump, $data_structure_reference
) = #_;
my $info = ${$data_structure_reference};
print "Received from child: \n";
dd $info;
}
);
my $pid = $pm->start;
if (!$pid) {
my ($stdout, $stderr, $exit) = capture {
sleep 4;
exec("xecho");
};
my $info = {stdout => $stdout, stderr => $stderr, exit=> $exit};
$pm->finish(0, \$info);
}
print "Master: waiting for child..\n";
$pm->wait_all_children;
Output:
Master: waiting for child..
Received from child:
{
exit => 0,
stderr => "Can't exec \"xecho\": No such file or directory at ./p.pl line 28.\n",
stdout => "",
}
In c#, we can create thread and start thread only on need like following (if I am correct)
Thread th=new thread("function");
th.start()
But in perl, when I create itself it has started. For example
$thread1=thread->create("function");
But I want to create 4 thread. I should start only on need. And I've to check whether it's running or not? if thread is not running, then I've to start the same thread by passing different parameter. How to do that in perl ?
Multiple jobs can be sent into the queue, and they are waiting for their turn to be passed to the worker.
use strict;
use warnings;
use threads;
use Thread::Queue;
my $no_of_workers = 4;
my $q = Thread::Queue->new();
# Worker thread
my #thr = map {
my $t = threads->create(sub{
# Thread will loop until no more work
while (defined(my $item = $q->dequeue())) {
# Do work on $item
print "$item\n";
}
});
{queue => $q, thread => $t, id => $_};
} 1 .. $no_of_workers;
# Send work to each thread
$_->{queue}->enqueue("Job for thread $_->{id}") for #thr;
for (#thr) {
# Signal that there is no more work to be sent
# $_->{queue}->end();
# similar to $queue->end() for older perl
$_->{queue}->enqueue(undef) for #thr;
# wait for threads to finish
$_->{thread}->join();
}
Assigning jobs 0..19 in circular way to workers,
for my $i (0 .. 19) {
my $t = $thr[$i % #thr]; # $i % #thr => 0,1,2,3, 0,1,2,3, ..
$t->{queue}->enqueue("Job for thread $t->{id}");
}
You don't want a queue for each thread! You'll end up with idle threads even if work's available.
use strict;
use warnings;
use threads;
use Thread::Queue 3.01 qw( );
use constant NUM_WORKERS => 4;
sub worker {
my ($job) = #_;
print("Job: $job\n");
sleep(rand(4)); # Pretending to do $job
}
{
my $q = Thread::Queue->new();
for (1..NUM_WORKERS) {
async {
while (defined(my $item = $q->dequeue())) {
worker($item);
}
};
}
# Give the workers work to do.
$q->enqueue($_) for 1..14;
# When you're done adding, wait for the workers to finish.
$q->end();
$_->join() for threads->list;
}
This code only does 4 threads, and then stops. It doesn't process the remaining 6 items in the queue.
I'm by no means an expert in perl or multithreading but I'm convinced I'm "doing it wrong" and need some guidance as to how to alter it so that I don't get thread quit warnings.
As you can see, this program reads in argument 0, does a lookup to find each of the IP addresses associated with a host name, then tests each IP to see if ssh is running.
The purpose of this script is to spawn a tcp test for each host and return the first successful tcp connect.
Can anyone suggest a way of doing this that is more reliable and doesn't require a sleep?
use strict;
use warnings;
BEGIN {
use Config;
$Config{useithreads} or die('Recompile Perl with threads to run this program.');
}
use threads;
use IO::Socket::INET;
$| = 1;
unless($ARGV[0]){ die("error please use argument")}
my $timeoutval=3;
my $testHost=$ARGV[0];
my $dlquery=`dig $testHost | grep 'IN A'`;
my $SUCCESS=0;
sub testSSHhost {
my $fqdn = shift;
my $socket = new IO::Socket::INET (
PeerHost => $fqdn,
PeerPort => '22',
Proto => 'tcp',
Timeout => $timeoutval,
) or return "ERROR in Socket Creation : $!\n";
my $tcpData = <$socket>;
$socket->close();
if ($tcpData && $tcpData=~/SSH/){
print "$fqdn\n";
$SUCCESS=1;
exit(0);
}
return $fqdn;
}
my #threads;
for my $line (split(/\n/,$dlquery)){
my #linesplit=split(/ /,$line);
$linesplit[0]=~s/\.$//;
$linesplit[0]=~s/ *//g;
my $t = threads->new(\&testSSHhost, $linesplit[0]);
push(#threads,$t);
}
while (!$SUCCESS){sleep 0.3}
Really what i'm trying to avoid is the "A thread exited while 2 threads were running." error message
Or a "segmentation fault" message
Something like this (not tested!):
use Modern::Perl;
use threads;
use Thread::Queue;
use IO::Socket::INET;
$| = 1;
my $testHost = $ARGV[0];
my $dlquery = `dig $testHost | grep 'IN A'`;
my $config = { NUMBER_OF_THREADS => 5 }; #how many threads you gonna use?
my $queue = Thread::Queue->new;
my $queue_processed = Thread::Queue->new;
for my $line ( split( /\n/, $dlquery ) ) {
my ($ip) = split( / /, $line );
$ip =~ s/\.$//;
$ip =~ s/ *//g;
$queue->enqueue($ip);
}
foreach my $thread_id ( 1 .. $config->{NUMBER_OF_THREADS} ) {
$queue->enqueue(undef);
my $thread = threads->create( \&testSSHhost() )->detach();
}
while ( $queue->pending() ) {
my $result = $queue_processed->dequeue();
if ( $result->{status} ) {
say $result->{ip};
}
}
sub testSSHhost {
while ( my $fqdn = $queue->dequeue() ) {
my $status = 0;
my $socket = new IO::Socket::INET(
PeerHost => $fqdn,
PeerPort => 22,
Proto => 'tcp',
Timeout => 3,
) or return "ERROR in Socket Creation : $!\n";
my $tcpData = <$socket>;
$socket->close();
if ( $tcpData && $tcpData =~ /SSH/ ) {
$status = 1;
}
$queue_processed->enqueue( { ip => $fqdn, status => $status, } );
}
return 0;
}
You can achieve it with Qeues:
http://search.cpan.org/dist/Thread-Queue/lib/Thread/Queue.pm
Before spawning the threads, you create a queue, and then let the threads push the successful IP address in it. Then the parent process would block in dequeuing until something pops up.