6 my $crawler_agents = qr{
8 sindice
-fetcher
| # http://sindice.com/developers/bot
11 libwww\
-perl
| # Comes from sup and other bots.
14 Mediapartners\
-Google
|
15 Jakarta\ Commons\
-HttpClient
| # comes from independent
16 aggregator
| # robots from spinn3r.com
22 lm114\
@nyu\
.edu
| # http://www.nyu.edu; lm114@nyu.edu
24 Rome\ Client
| # https://rome.dev.java.net/
28 www\
.fetch\
.com
| # www.fetch.com
33 PostRank
| # http://postrank.com
37 Vienna
| # http://www.vienna-rss.org
39 centerim
| # http://www.centerim.org/index.php/User_Manual#LiveJournal
41 Support\ Search\ Agent
| # This is our own abusebot
45 lwp\
-trivial
| # Comes from perl module LWP::Simple (script/bot)
46 BuzzTracker
| # http://www.buzztracker.com
50 Flexum
| # Flexum.ru search service
51 LucidMedia\ ClickSense
| # comes from amazonaws
52 Nutch
| # http://lucene.apache.org/nutch/about.html
55 Top\
-Indexer
| # Top-Indexer; http://www.artlebedev.ru; gregory@artlebedev.ru
57 relevantnoise\
.com
| # http://relevantnoise.com
60 LJpoisk\
.ru
| # RU Search Engine
61 Virtual\ Reach\ Newsclip\ Collector
|
68 Bloglovin
| # http://www.bloglovin.com/
72 Smokeping
| # http://oss.oetiker.ch/smokeping/
74 Gregarius
| # http://devlog.gregarius.net/docs/ua
75 blogged\_crawl
| # Nothing found on Google for this.
76 LjSEEK
| # http://www.ljseek.com/ or http://ljsearch.net
78 larbin
| # http://www.webmasterworld.com/forum11/2926.htm
80 LeapTag
| # http://leaptag.com/leaptag.php
82 online\
@monitoring\
.ru
|
84 gooblog
| # http://help.goo.ne.jp/contact/
86 heritrix
| # www.kit.edu
91 Megite
| # http://www.megite.com/
94 BTWebClient
| # utorrent.com
96 Amazon\
.com\ Blog\ Parser
|
100 Microsoft\ URL\ Control
|
103 CaRP
| # http://www.geckotribe.com/rss/carp/
105 LJSearch
| # http://www.ljseek.com/ or http://ljsearch.net
106 ^NIF
| # http://www.newsisfree.com/robot.php
107 StackRambler
| # Russian Search Engine: http://www.rambler.ru/
110 Sphere\ Scout
| # scout at sphere dot com
111 OpenISearch
| # http://www.openisearch.com/faq.html
112 CyberPsy
| # http://avalon.departament.com/lj-cyberpsy/disclaimer.html
113 WWWC
| # http://www.nakka.com/wwwc/
114 Filer\
.pro
| # Nothing found on Google for this.
115 Yacy
| # http://yacy.net/bot.html
116 Teleport\ Pro
| # http://www.tenmax.com/teleport/pro/home.htm
117 ShopWiki
| # http://www.shopwiki.com/wiki/Help:Bot
123 my ($class, $useragent) = @_;
125 return defined $useragent && $useragent =~ $crawler_agents;