Skip to content

Commit 262dfb1

Browse files
refactor to simplify logic
1 parent 4d02b7f commit 262dfb1

File tree

10 files changed

+777
-887
lines changed

10 files changed

+777
-887
lines changed

package-lock.json

Lines changed: 327 additions & 79 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@
8484
"@adobe/spacecat-shared-rum-api-client": "2.40.4",
8585
"@adobe/spacecat-shared-scrape-client": "2.3.6",
8686
"@adobe/spacecat-shared-slack-client": "1.5.32",
87-
"@adobe/spacecat-shared-utils": "https://gist.github.com/tkotthakota-adobe/5c0d682a53ec36acd705d99dd6f1f144/raw/746baa2999c74031803aab641f1f81ba3ea1736f/adobe-spacecat-shared-utils-1.86.0.tgz",
87+
"@adobe/spacecat-shared-utils": "https://gist.github.com/tkotthakota-adobe/655342ca8fe806db4e508761465becb2/raw/72a22c19f8a596650493990d8443238466f0d224/adobe-spacecat-shared-utils-1.87.0.tgz",
8888
"@aws-sdk/client-s3": "3.966.0",
8989
"@aws-sdk/client-cloudwatch-logs": "3.966.0",
9090
"@aws-sdk/client-lambda": "3.966.0",

src/tasks/opportunity-status-processor/handler.js

Lines changed: 44 additions & 178 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,20 @@
1111
*/
1212

1313
import { ok } from '@adobe/spacecat-shared-http-utils';
14-
import { CloudWatchLogsClient, FilterLogEventsCommand } from '@aws-sdk/client-cloudwatch-logs';
1514
import RUMAPIClient from '@adobe/spacecat-shared-rum-api-client';
1615
import GoogleClient from '@adobe/spacecat-shared-google-client';
1716
import { ScrapeClient } from '@adobe/spacecat-shared-scrape-client';
18-
import { resolveCanonicalUrl, formatAllowlistMessage } from '@adobe/spacecat-shared-utils';
19-
import { say, formatBotProtectionSlackMessage } from '../../utils/slack-utils.js';
20-
import { queryBotProtectionLogs, aggregateBotProtectionStats } from '../../utils/cloudwatch-utils.js';
17+
import { resolveCanonicalUrl } from '@adobe/spacecat-shared-utils';
18+
import {
19+
checkAndAlertBotProtection,
20+
checkAuditExecution,
21+
getAuditFailureReason,
22+
} from '../../utils/cloudwatch-utils.js';
23+
import { say } from '../../utils/slack-utils.js';
2124
import { getOpportunitiesForAudit } from './audit-opportunity-map.js';
2225
import { OPPORTUNITY_DEPENDENCY_MAP } from './opportunity-dependency-map.js';
2326

2427
const TASK_TYPE = 'opportunity-status-processor';
25-
const AUDIT_WORKER_LOG_GROUP = '/aws/lambda/spacecat-services--audit-worker';
2628

2729
/**
2830
* Checks if RUM is available for a domain by attempting to get a domainkey
@@ -128,9 +130,10 @@ function getOpportunityTitle(opportunityType) {
128130
*
129131
* @param {string} baseUrl - The base URL to check
130132
* @param {object} context - The context object with env and log
133+
* @param {number} [onboardStartTime] - Optional onboard start timestamp to filter jobs
131134
* @returns {Promise<{available: boolean, results: Array}>} Scraping availability and URL results
132135
*/
133-
async function isScrapingAvailable(baseUrl, context) {
136+
async function isScrapingAvailable(baseUrl, context, onboardStartTime) {
134137
const { log } = context;
135138

136139
try {
@@ -152,8 +155,20 @@ async function isScrapingAvailable(baseUrl, context) {
152155
return { available: false, results: [] };
153156
}
154157

155-
// Sort jobs by date (latest first) - assuming jobs have a timestamp field
156-
const sortedJobs = jobs.sort((a, b) => {
158+
// Filter jobs created after onboardStartTime
159+
const filteredJobs = jobs.filter((job) => {
160+
const jobTimestamp = new Date(job.startedAt || job.createdAt || 0).getTime();
161+
return jobTimestamp >= onboardStartTime;
162+
});
163+
log.info(`Filtered ${filteredJobs.length} jobs created after onboardStartTime from ${jobs.length} total jobs`);
164+
165+
if (filteredJobs.length === 0) {
166+
log.info(`No scrape jobs found for ${baseUrl} after onboardStartTime ${new Date(onboardStartTime).toISOString()}`);
167+
return { available: false, results: [] };
168+
}
169+
170+
// Sort jobs by date (latest first)
171+
const sortedJobs = filteredJobs.sort((a, b) => {
157172
const dateA = new Date(b.startedAt || b.createdAt || 0);
158173
const dateB = new Date(a.startedAt || a.createdAt || 0);
159174
return dateA - dateB;
@@ -208,133 +223,6 @@ async function isScrapingAvailable(baseUrl, context) {
208223
* @param {object} context - The context object with log
209224
* @returns {object|null} Bot protection details if detected, null otherwise
210225
*/
211-
/**
212-
* Detects bot protection by checking CloudWatch logs for bot protection events.
213-
*
214-
* Content Scraper logs bot protection events to CloudWatch, making logs the source of truth.
215-
* This function uses searchStartTime to query CloudWatch logs.
216-
*
217-
* @param {string} scrapeJobId - The scrape job ID for CloudWatch log querying
218-
* @param {number} searchStartTime - Search start timestamp (ms) to limit CloudWatch query window
219-
* @param {object} context - The context object with env, log
220-
* @returns {Promise<object|null>} Bot protection statistics or null
221-
*/
222-
async function checkBotProtectionInScrapes(
223-
scrapeJobId,
224-
searchStartTime,
225-
context,
226-
) {
227-
const { log } = context;
228-
229-
if (!scrapeJobId || !searchStartTime) {
230-
log.debug('[BOT-BLOCKED] Skipping bot protection check: missing scrapeJobId or searchStartTime');
231-
return null;
232-
}
233-
234-
log.info(`[BOT-BLOCKED] Querying CloudWatch logs for bot protection from ${new Date(searchStartTime).toISOString()}`);
235-
const logEvents = await queryBotProtectionLogs(scrapeJobId, context, searchStartTime);
236-
237-
if (logEvents.length === 0) {
238-
// No bot protection detected in logs
239-
return null;
240-
}
241-
242-
// Parse and aggregate bot protection statistics from logs
243-
const botProtectionStats = aggregateBotProtectionStats(logEvents);
244-
log.warn(`[BOT-BLOCKED] Bot protection detected: ${botProtectionStats.totalCount} URLs blocked (from CloudWatch logs) for job ${scrapeJobId}`);
245-
246-
return botProtectionStats;
247-
}
248-
249-
/**
250-
* Searches CloudWatch logs for audit execution
251-
* @param {string} auditType - The audit type to search for
252-
* @param {string} siteId - The site ID
253-
* @param {number} onboardStartTime - The onboarding start timestamp
254-
* @param {object} context - The context object
255-
* @returns {Promise<boolean>} Whether the audit was executed
256-
*/
257-
async function checkAuditExecution(auditType, siteId, onboardStartTime, context) {
258-
const { log, env } = context;
259-
const logGroupName = AUDIT_WORKER_LOG_GROUP;
260-
261-
try {
262-
const cloudWatchClient = new CloudWatchLogsClient({ region: env.AWS_REGION || 'us-east-1' });
263-
const filterPattern = `"Received ${auditType} audit request for: ${siteId}"`;
264-
265-
// Add small buffer before onboardStartTime to account for clock skew and processing delays
266-
// The audit log should be after onboardStartTime, but we add a small buffer for safety
267-
const bufferMs = 60 * 1000; // 1 minute
268-
const searchStartTime = onboardStartTime ? onboardStartTime - bufferMs : undefined;
269-
270-
const command = new FilterLogEventsCommand({
271-
logGroupName,
272-
filterPattern,
273-
startTime: searchStartTime,
274-
endTime: Date.now(),
275-
});
276-
277-
const response = await cloudWatchClient.send(command);
278-
const found = response.events && response.events.length > 0;
279-
280-
return found;
281-
} catch (error) {
282-
log.error(`Error checking audit execution for ${auditType}:`, error);
283-
return false;
284-
}
285-
}
286-
287-
/**
288-
* Searches CloudWatch logs for audit failure reason
289-
* @param {string} auditType - The audit type to search for
290-
* @param {string} siteId - The site ID
291-
* @param {number} onboardStartTime - The onboarding start timestamp
292-
* @param {object} context - The context object
293-
* @returns {Promise<string|null>} The failure reason or null if not found
294-
*/
295-
async function getAuditFailureReason(auditType, siteId, onboardStartTime, context) {
296-
const { log, env } = context;
297-
const logGroupName = AUDIT_WORKER_LOG_GROUP;
298-
299-
try {
300-
const cloudWatchClient = new CloudWatchLogsClient({ region: env.AWS_REGION || 'us-east-1' });
301-
const filterPattern = `"${auditType} audit for ${siteId} failed"`;
302-
303-
// Add small buffer before onboardStartTime to account for clock skew and processing delays
304-
const bufferMs = 30 * 1000; // 30 seconds
305-
const searchStartTime = onboardStartTime ? onboardStartTime - bufferMs : undefined;
306-
307-
const command = new FilterLogEventsCommand({
308-
logGroupName,
309-
filterPattern,
310-
startTime: searchStartTime,
311-
endTime: Date.now(),
312-
});
313-
314-
const response = await cloudWatchClient.send(command);
315-
316-
if (response.events && response.events.length > 0) {
317-
// Extract reason from the message
318-
const { message } = response.events[0];
319-
const reasonMatch = message.match(/Reason:\s*([^]+?)(?:\s+at\s|$)/);
320-
if (reasonMatch && reasonMatch[1]) {
321-
return reasonMatch[1].trim();
322-
}
323-
// Fallback: return entire message if "Reason:" pattern not found
324-
return message.trim();
325-
}
326-
327-
return null;
328-
/* c8 ignore start */
329-
// Defensive error handling: Difficult to test as requires CloudWatch API to throw errors.
330-
// Would need complex AWS SDK mocking infrastructure for marginal coverage gain.
331-
} catch (error) {
332-
log.error(`Error checking audit failure for ${auditType}:`, error);
333-
return null;
334-
}
335-
/* c8 ignore stop */
336-
}
337-
338226
/**
339227
* Analyzes missing opportunities and determines the root cause
340228
* @param {Array<string>} missingOpportunities - Array of missing opportunity types
@@ -529,7 +417,27 @@ export async function runOpportunityStatusProcessor(message, context) {
529417
}
530418

531419
if (needsScraping) {
532-
const scrapingCheck = await isScrapingAvailable(siteUrl, context);
420+
// Check for bot protection FIRST before fetching scrape results
421+
const botProtectionStats = await checkAndAlertBotProtection({
422+
siteId,
423+
siteUrl,
424+
searchStartTime: onboardStartTime,
425+
slackContext,
426+
context,
427+
});
428+
429+
// Abort processing if bot protection detected
430+
if (botProtectionStats && botProtectionStats.totalCount > 0) {
431+
log.warn(`[BOT-BLOCKED] Bot protection blocking scrapes for ${siteUrl}`);
432+
return ok({
433+
message: `Bot protection detected for ${siteUrl}`,
434+
botProtectionDetected: true,
435+
blockedUrlCount: botProtectionStats.totalCount,
436+
});
437+
}
438+
439+
// Only check scraping availability if no bot protection detected
440+
const scrapingCheck = await isScrapingAvailable(siteUrl, context, onboardStartTime);
533441
scrapingAvailable = scrapingCheck.available;
534442

535443
// Send Slack notification with scraping statistics if available
@@ -551,48 +459,6 @@ export async function runOpportunityStatusProcessor(message, context) {
551459
await say(env, log, slackContext, statsMessage);
552460
}
553461
}
554-
555-
// Check for bot protection via CloudWatch logs
556-
log.info(`[BOT-BLOCKED] Bot protection check conditions: jobId=${!!scrapingCheck.jobId}, slackContext=${!!slackContext}, onboardStartTime=${!!onboardStartTime}`);
557-
558-
if (scrapingCheck.jobId && slackContext) {
559-
// Use onboardStartTime if available, otherwise use a reasonable fallback (24 hours ago)
560-
const searchStartTime = onboardStartTime || (Date.now() - (24 * 60 * 60 * 1000));
561-
log.info(`[BOT-BLOCKED] Checking bot protection for scrape job: ${scrapingCheck.jobId}, searchStartTime: ${new Date(searchStartTime).toISOString()}`);
562-
const botProtectionStats = await checkBotProtectionInScrapes(
563-
scrapingCheck.jobId, // Scrape job ID for CloudWatch log querying
564-
searchStartTime, // Search start time (onboard time or 24h ago)
565-
context,
566-
);
567-
568-
if (botProtectionStats && botProtectionStats.totalCount > 0) {
569-
log.warn(`[BOT-BLOCKED] Bot protection blocking scrapes for ${siteUrl} - aborting task processing`);
570-
571-
// Get bot IPs from environment and send alert
572-
const botIps = env.SPACECAT_BOT_IPS || '';
573-
const allowlistInfo = formatAllowlistMessage(botIps);
574-
575-
await say(
576-
env,
577-
log,
578-
slackContext,
579-
formatBotProtectionSlackMessage({
580-
siteUrl,
581-
stats: botProtectionStats,
582-
totalUrlCount: scrapingCheck.results.length,
583-
allowlistIps: allowlistInfo.ips,
584-
allowlistUserAgent: allowlistInfo.userAgent,
585-
}),
586-
);
587-
588-
// Abort processing when bot protection detected
589-
return ok({
590-
message: `Task processing aborted: Bot protection detected for ${siteUrl}`,
591-
botProtectionDetected: true,
592-
blockedUrlCount: botProtectionStats.totalCount,
593-
});
594-
}
595-
}
596462
}
597463
} catch (error) {
598464
log.warn(`Could not resolve canonical URL or parse siteUrl for data source checks: ${siteUrl}`, error);

0 commit comments

Comments
 (0)