Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks, or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: webrecorder/browsertrix
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: v1.14.7
Choose a base ref
...
head repository: webrecorder/browsertrix
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: v1.14.8
Choose a head ref
  • 3 commits
  • 22 files changed
  • 4 contributors

Commits on Mar 27, 2025

  1. task: Display built-in behaviors as list (#2518)

    - Displays built-in behaviors as single field in workflow settings
    - Standardizes how "None" is displayed in workflow settings
    - Refactors behavior names into enum
    SuaYoo authored Mar 27, 2025

    Verified

    This commit was created on GitHub.com and signed with GitHub’s verified signature.
    Copy the full SHA
    df8c80f View commit details

Commits on Mar 31, 2025

  1. support overriding crawler image pull policy per channel (#2523)

    - add 'imagePullPolicy' field to each crawler channel declaration
    - if unset, defaults to the setting in the existing
    'crawler_image_pull_policy' field.
    
    fixes #2522
    
    ---------
    
    Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
    ikreymer and tw4l authored Mar 31, 2025

    Verified

    This commit was created on GitHub.com and signed with GitHub’s verified signature.
    Copy the full SHA
    62e47a8 View commit details
  2. version: update to 1.14.8

    ikreymer committed Mar 31, 2025
    Copy the full SHA
    b5b4c4d View commit details
18 changes: 18 additions & 0 deletions backend/btrixcloud/crawlconfigs.py
Original file line number Diff line number Diff line change
@@ -85,6 +85,7 @@ class CrawlConfigOps:

crawler_channels: CrawlerChannels
crawler_images_map: dict[str, str]
crawler_image_pull_policy_map: dict[str, str]

def __init__(
self,
@@ -108,6 +109,9 @@ def __init__(
self.coll_ops = cast(CollectionOps, None)

self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"]
self.default_crawler_image_pull_policy = os.environ.get(
"DEFAULT_CRAWLER_IMAGE_PULL_POLICY", "IfNotPresent"
)

self.router = APIRouter(
prefix="/crawlconfigs",
@@ -118,13 +122,18 @@ def __init__(
self._file_rx = re.compile("\\W+")

self.crawler_images_map = {}
self.crawler_image_pull_policy_map = {}
channels = []
with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh:
crawler_list = json.loads(fh.read())
for channel_data in crawler_list:
channel = CrawlerChannel(**channel_data)
channels.append(channel)
self.crawler_images_map[channel.id] = channel.image
if channel.imagePullPolicy:
self.crawler_image_pull_policy_map[channel.id] = (
channel.imagePullPolicy
)

self.crawler_channels = CrawlerChannels(channels=channels)

@@ -960,6 +969,15 @@ def get_channel_crawler_image(
"""Get crawler image name by id"""
return self.crawler_images_map.get(crawler_channel or "")

def get_channel_crawler_image_pull_policy(
    self, crawler_channel: Optional[str]
) -> str:
    """Get crawler image pull policy for a crawler channel id.

    Looks up the per-channel ``imagePullPolicy`` override; if the channel
    has no override (or ``crawler_channel`` is None/unknown), falls back
    to the deployment-wide default pull policy.
    """
    # "or" fallback: missing key, None channel, and empty-string override
    # all resolve to the default policy.
    return (
        self.crawler_image_pull_policy_map.get(crawler_channel or "")
        or self.default_crawler_image_pull_policy
    )

def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]:
"""Load CrawlerProxy mapping from config"""
proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"]
2 changes: 2 additions & 0 deletions backend/btrixcloud/crawlmanager.py
Original file line number Diff line number Diff line change
@@ -33,6 +33,7 @@ async def run_profile_browser(
url: str,
storage: StorageRef,
crawler_image: str,
image_pull_policy: str,
baseprofile: str = "",
profile_filename: str = "",
proxy_id: str = "",
@@ -57,6 +58,7 @@ async def run_profile_browser(
"vnc_password": secrets.token_hex(16),
"expire_time": date_to_str(dt_now() + timedelta(seconds=30)),
"crawler_image": crawler_image,
"image_pull_policy": image_pull_policy,
"proxy_id": proxy_id or DEFAULT_PROXY_ID,
}

1 change: 1 addition & 0 deletions backend/btrixcloud/models.py
Original file line number Diff line number Diff line change
@@ -603,6 +603,7 @@ class CrawlerChannel(BaseModel):

id: str
image: str
imagePullPolicy: Optional[str] = None


# ============================================================================
5 changes: 5 additions & 0 deletions backend/btrixcloud/operator/crawls.py
Original file line number Diff line number Diff line change
@@ -279,6 +279,11 @@ async def sync_crawls(self, data: MCSyncData):
)

params["crawler_image"] = status.crawlerImage
pull_policy = self.crawl_config_ops.get_channel_crawler_image_pull_policy(
crawl.crawler_channel
)
if pull_policy:
params["crawler_image_pull_policy"] = pull_policy

if crawl.proxy_id and not crawl.is_qa:
proxy = self.crawl_config_ops.get_crawler_proxy(crawl.proxy_id)
3 changes: 3 additions & 0 deletions backend/btrixcloud/operator/profiles.py
Original file line number Diff line number Diff line change
@@ -45,6 +45,9 @@ async def sync_profile_browsers(self, data: MCSyncData):
params["storage_secret"] = storage_secret
params["profile_filename"] = spec.get("profileFilename", "")
params["crawler_image"] = spec["crawlerImage"]
pull_policy = spec.get("imagePullPolicy")
if pull_policy:
params["crawler_image_pull_policy"] = pull_policy

proxy_id = spec.get("proxyId")
if proxy_id:
5 changes: 5 additions & 0 deletions backend/btrixcloud/profiles.py
Original file line number Diff line number Diff line change
@@ -110,6 +110,10 @@ async def create_new_browser(
if not crawler_image:
raise HTTPException(status_code=404, detail="crawler_not_found")

image_pull_policy = self.crawlconfigs.get_channel_crawler_image_pull_policy(
profile_launch.crawlerChannel
)

# use either specified proxyId or if none, use proxyId from existing profile
proxy_id = profile_launch.proxyId or prev_proxy_id

@@ -122,6 +126,7 @@ async def create_new_browser(
url=str(profile_launch.url),
storage=org.storage,
crawler_image=crawler_image,
image_pull_policy=image_pull_policy,
baseprofile=prev_profile_id,
profile_filename=prev_profile_path,
proxy_id=proxy_id,
2 changes: 1 addition & 1 deletion backend/btrixcloud/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""current version"""

__version__ = "1.14.7"
__version__ = "1.14.8"
2 changes: 1 addition & 1 deletion chart/Chart.yaml
Original file line number Diff line number Diff line change
@@ -5,7 +5,7 @@ type: application
icon: https://webrecorder.net/assets/icon.png

# Browsertrix and Chart Version
version: v1.14.7
version: v1.14.8

dependencies:
- name: btrix-admin-logging
1 change: 1 addition & 0 deletions chart/app-templates/profile_job.yaml
Original file line number Diff line number Diff line change
@@ -23,6 +23,7 @@ spec:

storageName: "{{ storage_name }}"
crawlerImage: "{{ crawler_image }}"
imagePullPolicy: "{{ image_pull_policy }}"

startUrl: "{{ url }}"
profileFilename: "{{ profile_filename }}"
2 changes: 2 additions & 0 deletions chart/examples/local-config.yaml
Original file line number Diff line number Diff line change
@@ -22,10 +22,12 @@
# crawler_channels:
# - id: default
# image: "docker.io/webrecorder/browsertrix-crawler:latest"
# imagePullPolicy: Always
#
# # Add, remove, or edit additional crawler release channels for example:
# - id: custom_version
# image: "<DOCKER IMAGE>"
# imagePullPolicy: IfNotPresent # optional

# overrides to use existing images in local Docker, otherwise will pull from repository
# backend_pull_policy: "Never"
2 changes: 2 additions & 0 deletions chart/templates/configmap.yaml
Original file line number Diff line number Diff line change
@@ -34,6 +34,8 @@ data:

DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"

DEFAULT_CRAWLER_IMAGE_PULL_POLICY: "{{ .Values.crawler_pull_policy }}"

MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"

IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"
13 changes: 8 additions & 5 deletions chart/values.yaml
Original file line number Diff line number Diff line change
@@ -103,8 +103,8 @@ replica_deletion_delay_days: 0

# API Image
# =========================================
backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.7"
backend_pull_policy: "Always"
backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.8"
backend_pull_policy: "IfNotPresent"

backend_password_secret: "PASSWORD!"

@@ -161,8 +161,8 @@ backend_avg_memory_threshold: 95

# Nginx Image
# =========================================
frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.7"
frontend_pull_policy: "Always"
frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.8"
frontend_pull_policy: "IfNotPresent"

frontend_cpu: "10m"

@@ -237,12 +237,15 @@ redis_storage: "3Gi"
crawler_channels:
- id: default
image: "docker.io/webrecorder/browsertrix-crawler:latest"
imagePullPolicy: Always

# Add, remove, or edit additional crawler versions below, for example:
# - id: custom_version
# image: "<DOCKER IMAGE>"
# imagePullPolicy: Always|IfNotPresent|Never (optional, defaults to crawler_pull_policy)

crawler_pull_policy: "Always"
# default crawler pull policy if not set per channel
crawler_pull_policy: "IfNotPresent"

crawler_namespace: "crawlers"

4 changes: 4 additions & 0 deletions frontend/docs/docs/deploy/customization.md
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@ The `crawler_channels` setting is used to specify the [_Crawler Release Channel_
crawler_channels:
- id: default
image: "docker.io/webrecorder/browsertrix-crawler:latest"
imagePullPolicy: Always # optional
```
This can be extended with additional channels. For example, here is what the value would look like adding a new x.y.z release of Browsertrix Crawler with the id `testing`:
@@ -28,8 +29,11 @@ crawler_channels:
image: "docker.io/webrecorder/browsertrix-crawler:latest"
- id: testing
image: "docker.io/webrecorder/browsertrix-crawler:x.y.z"
imagePullPolicy: IfNotPresent
```

The `imagePullPolicy` per channel is optional. If not set, the value set in `crawler_pull_policy` is used as the default.

## Storage

The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, which includes a `is_default_primary: true`.
2 changes: 1 addition & 1 deletion frontend/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-frontend",
"version": "1.14.7",
"version": "1.14.8",
"main": "index.ts",
"license": "AGPL-3.0-or-later",
"dependencies": {
45 changes: 21 additions & 24 deletions frontend/src/components/ui/config-details.ts
Original file line number Diff line number Diff line change
@@ -8,7 +8,13 @@ import capitalize from "lodash/fp/capitalize";
import RegexColorize from "regex-colorize";

import { BtrixElement } from "@/classes/BtrixElement";
import type { CrawlConfig, Seed, SeedConfig } from "@/pages/org/types";
import { none, notSpecified } from "@/layouts/empty";
import {
Behavior,
type CrawlConfig,
type Seed,
type SeedConfig,
} from "@/pages/org/types";
import { labelFor } from "@/strings/crawl-workflows/labels";
import scopeTypeLabel from "@/strings/crawl-workflows/scopeType";
import sectionStrings from "@/strings/crawl-workflows/section";
@@ -162,22 +168,15 @@ export class ConfigDetails extends BtrixElement {
heading: sectionStrings.behaviors,
renderDescItems: (seedsConfig) => html`
${this.renderSetting(
labelFor.autoscrollBehavior,
seedsConfig?.behaviors &&
!seedsConfig.behaviors.includes("autoscroll")
? msg("Disabled")
: html`<span class="text-neutral-400"
>${msg("Enabled (default)")}</span
>`,
)}
${this.renderSetting(
labelFor.autoclickBehavior,
seedsConfig?.behaviors &&
seedsConfig.behaviors.includes("autoclick")
? msg("Enabled")
: html`<span class="text-neutral-400"
>${msg("Disabled (default)")}</span
>`,
labelFor.behaviors,
[
seedsConfig?.behaviors?.includes(Behavior.AutoScroll) &&
labelFor.autoscrollBehavior,
seedsConfig?.behaviors?.includes(Behavior.AutoClick) &&
labelFor.autoclickBehavior,
]
.filter((v) => v)
.join(", ") || none,
)}
${this.renderSetting(
labelFor.pageLoadTimeoutSeconds,
@@ -424,7 +423,7 @@ export class ConfigDetails extends BtrixElement {
)}
</ul>
`
: msg("None"),
: none,
true,
),
)}
@@ -463,7 +462,7 @@ export class ConfigDetails extends BtrixElement {
})}
</ul>
`
: msg("None"),
: none,
true,
)}
${when(
@@ -477,7 +476,7 @@ export class ConfigDetails extends BtrixElement {
</btrix-queue-exclusion-table>
</div>
`,
() => this.renderSetting(msg("Exclusions"), msg("None")),
() => this.renderSetting(msg("Exclusions"), none),
)}
`;
};
@@ -490,11 +489,9 @@ export class ConfigDetails extends BtrixElement {
} else if (typeof value === "boolean") {
content = value ? msg("Yes") : msg("No");
} else if (Array.isArray(value) && !value.length) {
content = html`<span class="text-neutral-400">${msg("None")}</span>`;
content = none;
} else if (typeof value !== "number" && !value) {
content = html`<span class="text-neutral-400"
>${msg("Not specified")}</span
>`;
content = notSpecified;
}
return html`
<btrix-desc-list-item label=${label} class=${breakAll ? "break-all" : ""}>
32 changes: 18 additions & 14 deletions frontend/src/features/crawl-workflows/workflow-editor.ts
Original file line number Diff line number Diff line change
@@ -62,7 +62,12 @@ import { labelFor } from "@/strings/crawl-workflows/labels";
import scopeTypeLabels from "@/strings/crawl-workflows/scopeType";
import sectionStrings from "@/strings/crawl-workflows/section";
import { AnalyticsTrackEvent } from "@/trackEvents";
import { ScopeType, type Seed, type WorkflowParams } from "@/types/crawler";
import {
Behavior,
ScopeType,
type Seed,
type WorkflowParams,
} from "@/types/crawler";
import type { UnderlyingFunction } from "@/types/utils";
import { NewWorkflowOnlyScopeType } from "@/types/workflow";
import { track } from "@/utils/analytics";
@@ -111,11 +116,10 @@ type ProgressState = {
tabs: Tabs;
};
const DEFAULT_BEHAVIORS = [
"autoscroll",
"autoplay",
"autofetch",
"siteSpecific",
];
Behavior.AutoPlay,
Behavior.AutoFetch,
Behavior.SiteSpecific,
] as const;
const formName = "newJobConfig" as const;
const panelSuffix = "--panel" as const;

@@ -1182,7 +1186,7 @@ https://archiveweb.page/images/${"logo.svg"}`}

private renderBehaviors() {
return html`
${this.renderSectionHeading(msg("Built-in Behaviors"))}
${this.renderSectionHeading(labelFor.behaviors)}
${inputCol(
html`<sl-checkbox
name="autoscrollBehavior"
@@ -2206,17 +2210,17 @@ https://archiveweb.page/images/${"logo.svg"}`}
}

private setBehaviors(): string {
let behaviors = (
this.formState.autoscrollBehavior
? DEFAULT_BEHAVIORS
: DEFAULT_BEHAVIORS.slice(1)
).join(",");
const behaviors: Behavior[] = [...DEFAULT_BEHAVIORS];

if (this.formState.autoscrollBehavior) {
behaviors.unshift(Behavior.AutoScroll);
}

if (this.formState.autoclickBehavior) {
behaviors += ",autoclick";
behaviors.push(Behavior.AutoClick);
}

return behaviors;
return behaviors.join(",");
}

private parseUrlListConfig(): Pick<
11 changes: 11 additions & 0 deletions frontend/src/layouts/empty.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { html } from "lit";

import { stringFor } from "@/strings/ui";

// Shared placeholder templates for empty values in settings/detail views.
// Both render the shared localized string in muted gray (text-neutral-400)
// so "no value" states look consistent across the app.

// Placeholder for a value the user never set.
export const notSpecified = html`<span class="text-neutral-400">
${stringFor.notSpecified}
</span>`;

// Placeholder for an explicitly empty value (e.g. an empty list).
export const none = html`<span class="text-neutral-400">
${stringFor.none}
</span>`;
Loading