 */

import { ok } from '@adobe/spacecat-shared-http-utils';
- import { CloudWatchLogsClient, FilterLogEventsCommand } from '@aws-sdk/client-cloudwatch-logs';
import RUMAPIClient from '@adobe/spacecat-shared-rum-api-client';
import GoogleClient from '@adobe/spacecat-shared-google-client';
import { ScrapeClient } from '@adobe/spacecat-shared-scrape-client';
- import { resolveCanonicalUrl, formatAllowlistMessage } from '@adobe/spacecat-shared-utils';
- import { say, formatBotProtectionSlackMessage } from '../../utils/slack-utils.js';
- import { queryBotProtectionLogs, aggregateBotProtectionStats } from '../../utils/cloudwatch-utils.js';
+ import { resolveCanonicalUrl } from '@adobe/spacecat-shared-utils';
+ import {
+   checkAndAlertBotProtection,
+   checkAuditExecution,
+   getAuditFailureReason,
+ } from '../../utils/cloudwatch-utils.js';
+ import { say } from '../../utils/slack-utils.js';
import { getOpportunitiesForAudit } from './audit-opportunity-map.js';
import { OPPORTUNITY_DEPENDENCY_MAP } from './opportunity-dependency-map.js';

const TASK_TYPE = 'opportunity-status-processor';
- const AUDIT_WORKER_LOG_GROUP = '/aws/lambda/spacecat-services--audit-worker';

/**
 * Checks if RUM is available for a domain by attempting to get a domainkey
@@ -128,9 +130,10 @@ function getOpportunityTitle(opportunityType) {
 *
 * @param {string} baseUrl - The base URL to check
 * @param {object} context - The context object with env and log
+ * @param {number} [onboardStartTime] - Optional onboard start timestamp to filter jobs
 * @returns {Promise<{available: boolean, results: Array}>} Scraping availability and URL results
 */
- async function isScrapingAvailable(baseUrl, context) {
+ async function isScrapingAvailable(baseUrl, context, onboardStartTime) {
  const { log } = context;

  try {
@@ -152,8 +155,20 @@ async function isScrapingAvailable(baseUrl, context) {
      return { available: false, results: [] };
    }

-     // Sort jobs by date (latest first) - assuming jobs have a timestamp field
-     const sortedJobs = jobs.sort((a, b) => {
+     // Filter jobs created after onboardStartTime
+     const filteredJobs = jobs.filter((job) => {
+       const jobTimestamp = new Date(job.startedAt || job.createdAt || 0).getTime();
+       return jobTimestamp >= onboardStartTime;
+     });
+     log.info(`Filtered ${filteredJobs.length} jobs created after onboardStartTime from ${jobs.length} total jobs`);
+
+     if (filteredJobs.length === 0) {
+       log.info(`No scrape jobs found for ${baseUrl} after onboardStartTime ${new Date(onboardStartTime).toISOString()}`);
+       return { available: false, results: [] };
+     }
+
+     // Sort jobs by date (latest first)
+     const sortedJobs = filteredJobs.sort((a, b) => {
      const dateA = new Date(b.startedAt || b.createdAt || 0);
      const dateB = new Date(a.startedAt || a.createdAt || 0);
      return dateA - dateB;
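      // Illustrative note (annotation, not from the diff): with the startedAt || createdAt || 0
      // fallback above, a job missing both timestamps resolves to epoch 0 and is therefore
      // dropped by the filter for any positive onboardStartTime. For example, given the
      // hypothetical jobs [{ startedAt: '2024-05-02T10:00:00Z' }, {}], only the first entry
      // passes the filter.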
@@ -208,133 +223,6 @@ async function isScrapingAvailable(baseUrl, context) {
 * @param {object} context - The context object with log
 * @returns {object|null} Bot protection details if detected, null otherwise
 */
- /**
-  * Detects bot protection by checking CloudWatch logs for bot protection events.
-  *
-  * Content Scraper logs bot protection events to CloudWatch, making logs the source of truth.
-  * This function uses searchStartTime to query CloudWatch logs.
-  *
-  * @param {string} scrapeJobId - The scrape job ID for CloudWatch log querying
-  * @param {number} searchStartTime - Search start timestamp (ms) to limit CloudWatch query window
-  * @param {object} context - The context object with env, log
-  * @returns {Promise<object|null>} Bot protection statistics or null
-  */
- async function checkBotProtectionInScrapes(
-   scrapeJobId,
-   searchStartTime,
-   context,
- ) {
-   const { log } = context;
-
-   if (!scrapeJobId || !searchStartTime) {
-     log.debug('[BOT-BLOCKED] Skipping bot protection check: missing scrapeJobId or searchStartTime');
-     return null;
-   }
-
-   log.info(`[BOT-BLOCKED] Querying CloudWatch logs for bot protection from ${new Date(searchStartTime).toISOString()}`);
-   const logEvents = await queryBotProtectionLogs(scrapeJobId, context, searchStartTime);
-
-   if (logEvents.length === 0) {
-     // No bot protection detected in logs
-     return null;
-   }
-
-   // Parse and aggregate bot protection statistics from logs
-   const botProtectionStats = aggregateBotProtectionStats(logEvents);
-   log.warn(`[BOT-BLOCKED] Bot protection detected: ${botProtectionStats.totalCount} URLs blocked (from CloudWatch logs) for job ${scrapeJobId}`);
-
-   return botProtectionStats;
- }
-
- /**
-  * Searches CloudWatch logs for audit execution
-  * @param {string} auditType - The audit type to search for
-  * @param {string} siteId - The site ID
-  * @param {number} onboardStartTime - The onboarding start timestamp
-  * @param {object} context - The context object
-  * @returns {Promise<boolean>} Whether the audit was executed
-  */
- async function checkAuditExecution(auditType, siteId, onboardStartTime, context) {
-   const { log, env } = context;
-   const logGroupName = AUDIT_WORKER_LOG_GROUP;
-
-   try {
-     const cloudWatchClient = new CloudWatchLogsClient({ region: env.AWS_REGION || 'us-east-1' });
-     const filterPattern = `"Received ${auditType} audit request for: ${siteId}"`;
-
-     // Add small buffer before onboardStartTime to account for clock skew and processing delays
-     // The audit log should be after onboardStartTime, but we add a small buffer for safety
-     const bufferMs = 60 * 1000; // 1 minute
-     const searchStartTime = onboardStartTime ? onboardStartTime - bufferMs : undefined;
-
-     const command = new FilterLogEventsCommand({
-       logGroupName,
-       filterPattern,
-       startTime: searchStartTime,
-       endTime: Date.now(),
-     });
-
-     const response = await cloudWatchClient.send(command);
-     const found = response.events && response.events.length > 0;
-
-     return found;
-   } catch (error) {
-     log.error(`Error checking audit execution for ${auditType}:`, error);
-     return false;
-   }
- }
-
- /**
-  * Searches CloudWatch logs for audit failure reason
-  * @param {string} auditType - The audit type to search for
-  * @param {string} siteId - The site ID
-  * @param {number} onboardStartTime - The onboarding start timestamp
-  * @param {object} context - The context object
-  * @returns {Promise<string|null>} The failure reason or null if not found
-  */
- async function getAuditFailureReason(auditType, siteId, onboardStartTime, context) {
-   const { log, env } = context;
-   const logGroupName = AUDIT_WORKER_LOG_GROUP;
-
-   try {
-     const cloudWatchClient = new CloudWatchLogsClient({ region: env.AWS_REGION || 'us-east-1' });
-     const filterPattern = `"${auditType} audit for ${siteId} failed"`;
-
-     // Add small buffer before onboardStartTime to account for clock skew and processing delays
-     const bufferMs = 30 * 1000; // 30 seconds
-     const searchStartTime = onboardStartTime ? onboardStartTime - bufferMs : undefined;
-
-     const command = new FilterLogEventsCommand({
-       logGroupName,
-       filterPattern,
-       startTime: searchStartTime,
-       endTime: Date.now(),
-     });
-
-     const response = await cloudWatchClient.send(command);
-
-     if (response.events && response.events.length > 0) {
-       // Extract reason from the message
-       const { message } = response.events[0];
-       const reasonMatch = message.match(/Reason:\s*([^]+?)(?:\s+at\s|$)/);
-       if (reasonMatch && reasonMatch[1]) {
-         return reasonMatch[1].trim();
-       }
-       // Fallback: return entire message if "Reason:" pattern not found
-       return message.trim();
-     }
-
-     return null;
-   /* c8 ignore start */
-   // Defensive error handling: Difficult to test as requires CloudWatch API to throw errors.
-   // Would need complex AWS SDK mocking infrastructure for marginal coverage gain.
-   } catch (error) {
-     log.error(`Error checking audit failure for ${auditType}:`, error);
-     return null;
-   }
-   /* c8 ignore stop */
- }
-
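// A minimal sketch of the extracted checkAuditExecution helper that is now imported from
// '../../utils/cloudwatch-utils.js', assuming it keeps the behavior of the code removed above
// (the log group name, filter pattern, and one-minute buffer are copied from that code; the
// real utility may differ, and would also export getAuditFailureReason and
// checkAndAlertBotProtection).
import { CloudWatchLogsClient, FilterLogEventsCommand } from '@aws-sdk/client-cloudwatch-logs';

const AUDIT_WORKER_LOG_GROUP = '/aws/lambda/spacecat-services--audit-worker';

export async function checkAuditExecution(auditType, siteId, onboardStartTime, context) {
  const { log, env } = context;
  try {
    const cloudWatchClient = new CloudWatchLogsClient({ region: env.AWS_REGION || 'us-east-1' });
    const command = new FilterLogEventsCommand({
      logGroupName: AUDIT_WORKER_LOG_GROUP,
      // Match the audit worker's "received request" log line for this site
      filterPattern: `"Received ${auditType} audit request for: ${siteId}"`,
      // One-minute buffer before onboardStartTime to absorb clock skew and processing delays
      startTime: onboardStartTime ? onboardStartTime - 60 * 1000 : undefined,
      endTime: Date.now(),
    });
    const response = await cloudWatchClient.send(command);
    return Boolean(response.events && response.events.length > 0);
  } catch (error) {
    log.error(`Error checking audit execution for ${auditType}:`, error);
    return false;
  }
}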
/**
 * Analyzes missing opportunities and determines the root cause
 * @param {Array<string>} missingOpportunities - Array of missing opportunity types
@@ -529,7 +417,27 @@ export async function runOpportunityStatusProcessor(message, context) {
    }

    if (needsScraping) {
-       const scrapingCheck = await isScrapingAvailable(siteUrl, context);
+       // Check for bot protection FIRST before fetching scrape results
+       const botProtectionStats = await checkAndAlertBotProtection({
+         siteId,
+         siteUrl,
+         searchStartTime: onboardStartTime,
+         slackContext,
+         context,
+       });
+
+       // Abort processing if bot protection detected
+       if (botProtectionStats && botProtectionStats.totalCount > 0) {
+         log.warn(`[BOT-BLOCKED] Bot protection blocking scrapes for ${siteUrl}`);
+         return ok({
+           message: `Bot protection detected for ${siteUrl}`,
+           botProtectionDetected: true,
+           blockedUrlCount: botProtectionStats.totalCount,
+         });
+       }
+
+       // Only check scraping availability if no bot protection detected
+       const scrapingCheck = await isScrapingAvailable(siteUrl, context, onboardStartTime);
      scrapingAvailable = scrapingCheck.available;

      // Send Slack notification with scraping statistics if available
@@ -551,48 +459,6 @@ export async function runOpportunityStatusProcessor(message, context) {
          await say(env, log, slackContext, statsMessage);
        }
      }
-
-       // Check for bot protection via CloudWatch logs
-       log.info(`[BOT-BLOCKED] Bot protection check conditions: jobId=${!!scrapingCheck.jobId}, slackContext=${!!slackContext}, onboardStartTime=${!!onboardStartTime}`);
-
-       if (scrapingCheck.jobId && slackContext) {
-         // Use onboardStartTime if available, otherwise use a reasonable fallback (24 hours ago)
-         const searchStartTime = onboardStartTime || (Date.now() - (24 * 60 * 60 * 1000));
-         log.info(`[BOT-BLOCKED] Checking bot protection for scrape job: ${scrapingCheck.jobId}, searchStartTime: ${new Date(searchStartTime).toISOString()}`);
-         const botProtectionStats = await checkBotProtectionInScrapes(
-           scrapingCheck.jobId, // Scrape job ID for CloudWatch log querying
-           searchStartTime, // Search start time (onboard time or 24h ago)
-           context,
-         );
-
-         if (botProtectionStats && botProtectionStats.totalCount > 0) {
-           log.warn(`[BOT-BLOCKED] Bot protection blocking scrapes for ${siteUrl} - aborting task processing`);
-
-           // Get bot IPs from environment and send alert
-           const botIps = env.SPACECAT_BOT_IPS || '';
-           const allowlistInfo = formatAllowlistMessage(botIps);
-
-           await say(
-             env,
-             log,
-             slackContext,
-             formatBotProtectionSlackMessage({
-               siteUrl,
-               stats: botProtectionStats,
-               totalUrlCount: scrapingCheck.results.length,
-               allowlistIps: allowlistInfo.ips,
-               allowlistUserAgent: allowlistInfo.userAgent,
-             }),
-           );
-
-           // Abort processing when bot protection detected
-           return ok({
-             message: `Task processing aborted: Bot protection detected for ${siteUrl}`,
-             botProtectionDetected: true,
-             blockedUrlCount: botProtectionStats.totalCount,
-           });
-         }
-       }
    }
  } catch (error) {
    log.warn(`Could not resolve canonical URL or parse siteUrl for data source checks: ${siteUrl}`, error);
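// A rough sketch of the consolidated checkAndAlertBotProtection helper imported from
// '../../utils/cloudwatch-utils.js', assuming it folds together the inline bot-protection
// handling removed above (queryBotProtectionLogs, aggregateBotProtectionStats,
// formatBotProtectionSlackMessage, and the Slack alert). How the helper scopes the CloudWatch
// query by siteId is an assumption here (the removed code queried by scrape job ID), so the
// whole body is indicative only:
//
// export async function checkAndAlertBotProtection({
//   siteId, siteUrl, searchStartTime, slackContext, context,
// }) {
//   const { env, log } = context;
//   if (!searchStartTime) return null;
//   const logEvents = await queryBotProtectionLogs(siteId, context, searchStartTime);
//   if (logEvents.length === 0) return null;
//   const stats = aggregateBotProtectionStats(logEvents);
//   log.warn(`[BOT-BLOCKED] ${stats.totalCount} URLs blocked for site ${siteId}`);
//   const allowlistInfo = formatAllowlistMessage(env.SPACECAT_BOT_IPS || '');
//   if (slackContext) {
//     await say(env, log, slackContext, formatBotProtectionSlackMessage({
//       siteUrl,
//       stats,
//       allowlistIps: allowlistInfo.ips,
//       allowlistUserAgent: allowlistInfo.userAgent,
//     }));
//   }
//   return stats;
// }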