mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 19:39:16 +08:00
Nick: python working
This commit is contained in:
parent
bb4808443c
commit
45e33563eb
@ -75,6 +75,7 @@ export interface FirecrawlDocument {
|
|||||||
html?: string;
|
html?: string;
|
||||||
rawHtml?: string;
|
rawHtml?: string;
|
||||||
links?: string[];
|
links?: string[];
|
||||||
|
extract?: Record<any, any>;
|
||||||
screenshot?: string;
|
screenshot?: string;
|
||||||
metadata: FirecrawlDocumentMetadata;
|
metadata: FirecrawlDocumentMetadata;
|
||||||
}
|
}
|
||||||
@ -344,12 +345,13 @@ export default class FirecrawlApp<T extends "v0" | "v1"> {
|
|||||||
Authorization: `Bearer ${this.apiKey}`,
|
Authorization: `Bearer ${this.apiKey}`,
|
||||||
} as AxiosRequestHeaders;
|
} as AxiosRequestHeaders;
|
||||||
let jsonData: any = { url, ...params };
|
let jsonData: any = { url, ...params };
|
||||||
if (jsonData?.extractorOptions?.extractionSchema) {
|
if (jsonData?.extractorOptions?.extractionSchema || jsonData?.extract?.schema) {
|
||||||
let schema = jsonData.extractorOptions.extractionSchema;
|
let schema = jsonData.extractorOptions?.extractionSchema || jsonData.extract?.schema;
|
||||||
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
|
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
|
||||||
if (schema instanceof z.ZodSchema) {
|
if (schema instanceof z.ZodSchema) {
|
||||||
schema = zodToJsonSchema(schema);
|
schema = zodToJsonSchema(schema);
|
||||||
}
|
}
|
||||||
|
if(this.version === 'v0') {
|
||||||
jsonData = {
|
jsonData = {
|
||||||
...jsonData,
|
...jsonData,
|
||||||
extractorOptions: {
|
extractorOptions: {
|
||||||
@ -358,6 +360,15 @@ export default class FirecrawlApp<T extends "v0" | "v1"> {
|
|||||||
mode: jsonData.extractorOptions.mode || "llm-extraction",
|
mode: jsonData.extractorOptions.mode || "llm-extraction",
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
} else {
|
||||||
|
jsonData = {
|
||||||
|
...jsonData,
|
||||||
|
extract: {
|
||||||
|
...jsonData.extract,
|
||||||
|
schema: schema,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
const response: AxiosResponse = await axios.post(
|
const response: AxiosResponse = await axios.post(
|
||||||
|
@ -1,17 +1,18 @@
|
|||||||
import uuid
|
import uuid
|
||||||
from firecrawl.firecrawl import FirecrawlApp
|
from firecrawl.firecrawl import FirecrawlApp
|
||||||
|
|
||||||
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
|
app = FirecrawlApp(api_key="fc-")
|
||||||
|
|
||||||
# Scrape a website:
|
# Scrape a website:
|
||||||
scrape_result = app.scrape_url('firecrawl.dev')
|
scrape_result = app.scrape_url('firecrawl.dev')
|
||||||
print(scrape_result['markdown'])
|
print(scrape_result['markdown'])
|
||||||
|
|
||||||
# Crawl a website:
|
# Crawl a website:
|
||||||
idempotency_key = str(uuid.uuid4()) # optional idempotency key
|
crawl_result = app.crawl_url('docs.firecrawl.dev', {}, True, 2)
|
||||||
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
|
|
||||||
print(crawl_result)
|
print(crawl_result)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# LLM Extraction:
|
# LLM Extraction:
|
||||||
# Define schema to extract contents into using pydantic
|
# Define schema to extract contents into using pydantic
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
@ -27,18 +28,15 @@ class TopArticlesSchema(BaseModel):
|
|||||||
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
||||||
|
|
||||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
||||||
'extractorOptions': {
|
'formats': ['extract'],
|
||||||
'extractionSchema': TopArticlesSchema.model_json_schema(),
|
'extract': {
|
||||||
'mode': 'llm-extraction'
|
'schema': TopArticlesSchema.model_json_schema()
|
||||||
},
|
|
||||||
'pageOptions':{
|
|
||||||
'onlyMainContent': True
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
print(llm_extraction_result['llm_extraction'])
|
print(llm_extraction_result['extract'])
|
||||||
|
|
||||||
# Define schema to extract contents into using json schema
|
# # Define schema to extract contents into using json schema
|
||||||
json_schema = {
|
json_schema = {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
@ -62,7 +60,10 @@ json_schema = {
|
|||||||
"required": ["top"]
|
"required": ["top"]
|
||||||
}
|
}
|
||||||
|
|
||||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
app2 = FirecrawlApp(api_key="fc-", version="v0")
|
||||||
|
|
||||||
|
|
||||||
|
llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
|
||||||
'extractorOptions': {
|
'extractorOptions': {
|
||||||
'extractionSchema': json_schema,
|
'extractionSchema': json_schema,
|
||||||
'mode': 'llm-extraction'
|
'mode': 'llm-extraction'
|
||||||
|
@ -58,20 +58,30 @@ class FirecrawlApp:
|
|||||||
|
|
||||||
# If there are additional params, process them
|
# If there are additional params, process them
|
||||||
if params:
|
if params:
|
||||||
# Initialize extractorOptions if present
|
if self.version == 'v0':
|
||||||
|
# Handle extractorOptions (for v0 compatibility)
|
||||||
extractor_options = params.get('extractorOptions', {})
|
extractor_options = params.get('extractorOptions', {})
|
||||||
# Check and convert the extractionSchema if it's a Pydantic model
|
if extractor_options:
|
||||||
if 'extractionSchema' in extractor_options:
|
if 'extractionSchema' in extractor_options and hasattr(extractor_options['extractionSchema'], 'schema'):
|
||||||
if hasattr(extractor_options['extractionSchema'], 'schema'):
|
|
||||||
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
|
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
|
||||||
# Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
|
|
||||||
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
|
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
|
||||||
# Update the scrape_params with the processed extractorOptions
|
|
||||||
scrape_params['extractorOptions'] = extractor_options
|
scrape_params['extractorOptions'] = extractor_options
|
||||||
|
|
||||||
# Include any other params directly at the top level of scrape_params
|
# Include any other params directly at the top level of scrape_params
|
||||||
for key, value in params.items():
|
for key, value in params.items():
|
||||||
if key != 'extractorOptions':
|
if key not in ['extractorOptions']:
|
||||||
|
scrape_params[key] = value
|
||||||
|
elif self.version == 'v1':
|
||||||
|
# Handle extract (for v1)
|
||||||
|
extract = params.get('extract', {})
|
||||||
|
if extract:
|
||||||
|
if 'schema' in extract and hasattr(extract['schema'], 'schema'):
|
||||||
|
extract['schema'] = extract['schema'].schema()
|
||||||
|
scrape_params['extract'] = extract
|
||||||
|
|
||||||
|
# Include any other params directly at the top level of scrape_params
|
||||||
|
for key, value in params.items():
|
||||||
|
if key not in ['extract']:
|
||||||
scrape_params[key] = value
|
scrape_params[key] = value
|
||||||
|
|
||||||
endpoint = f'/{self.version}/scrape'
|
endpoint = f'/{self.version}/scrape'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user