o
    }!gn`                  
   @   s  d dl Z d dlZd dlZd dlmZmZmZ d dlmZ d dl	m
Z
mZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d d	l+m,Z, d d
l-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 edZ6ee7ee8ef Z9ee)ee f Z:ee#j;e j<dZ=G dd dZ>G dd de,Z?e@dkr	d dlAmBZB eB  eeCjDjDd d ZEe? ZFzeFjGddZHW n eIy   eFJddddiddiddZHY nw eHKeEd d  ZLeFMeHjN eOeL dS dS )!    N)BufferedIOBaseBufferedReaderBytesIO)Path)ListOptionalTypeUnion	CoroutineAnyTypeVar)	BaseModel)ExtractAgentExtractAgentCreateExtractConfig
ExtractJobExtractJobCreate
ExtractRunExtractSchemaValidateRequestExtractAgentUpdateFileExtractMode
StatusEnumProjectExtractTargetLlamaExtractSettingsPaginatedExtractRunsResponse)AsyncLlamaCloud)JSONObjectTypeaugment_async_errors)BaseComponent)run_jobs)FieldPrivateAttr)DEFAULT_BASE_URL)ThreadPoolExecutorT)extraction_targetextraction_modec                   @   s:  e Zd ZdZ									dGd	ed
edee dee dededede	de	dee	 dee
 fddZdeeeef defddZedefddZedefddZedefddZejd eddfd!dZedefd"d#Zejd$eddfd%d#Zd&edefd'd(Zd)edee fd*d+ZdHd,d-Zd.eee e f d/e!dee"e e" f fd0d1Z#d.eee e f dee"e e" f fd2d3Z$d.eee e f deee e f fd4d5Z%d.eee e f deee e f fd6d7Z&d)ede"fd8d9Z'd)edefd:d;Z(d<eddfd=d>Z)	@dIdAedBede*fdCdDZ+defdEdFZ,dS )JExtractionAgentzTClass representing a single extraction agent with methods for extraction operations.N        TF<   clientagent
project_idorganization_idcheck_intervalmax_timeoutnum_workersshow_progressverboseverifyhttpx_timeoutc                 C   sp   || _ || _|| _|| _|| _|| _|| _|| _|
| _|| _	|	| _
d | _d | _ttdt p/dd d| _d S )N
   r*   r,   max_workers)_client_agent_project_id_organization_idr2   r3   r4   r5   r7   r8   _verbose_data_schema_configr%   minos	cpu_count_thread_pool)selfr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8    rH   f/mnt/skqttb/ctump_chatbot/chatbot/lib/python3.10/site-packages/llama_cloud_services/extract/extract.py__init__1   s    zExtractionAgent.__init__cororeturnc                    $   dt f fdd}j| S )=Run coroutine in a separate thread to avoid event loop issuesrL   c                         dt f fdd} t|  S )NrL   c               	      s   j jj} tjjjd4 I d H "}|j j_z I d H W | j j_W  d   I d H  S | j j_w 1 I d H s;w   Y  d S )Nr7   timeout)r<   _client_wrapperhttpx_clienthttpxAsyncClientr7   r8   )original_clientr.   rK   rG   rH   rI   wrapped_coroT   s   




zFExtractionAgent._run_in_thread.<locals>.run_coro.<locals>.wrapped_coror&   asynciorunrX   rW   rH   rI   run_coroS   s   z0ExtractionAgent._run_in_thread.<locals>.run_coror&   rF   submitresultrG   rK   r]   rH   rW   rI   _run_in_threadP   s   zExtractionAgent._run_in_threadc                 C      | j jS N)r=   idrG   rH   rH   rI   re   i      zExtractionAgent.idc                 C   rc   rd   )r=   namerf   rH   rH   rI   rh   m   rg   zExtractionAgent.namec                 C      | j s| jjS | j S rd   )rA   r=   data_schemarf   rH   rH   rI   rj   q      zExtractionAgent.data_schemarj   c                 C   s^   t |tr|}nt |trt|tr| }ntd| | jj	j
t|dd}|j| _d S )N;data_schema must be either a dictionary or a Pydantic model)rj   request)
isinstancedicttype
issubclassr   model_json_schema
ValueErrorrb   r<   llama_extractvalidate_extraction_schemar   rj   rA   )rG   rj   processed_schemavalidated_schemarH   rH   rI   rj   u   s   

c                 C   ri   rd   )rB   r=   configrf   rH   rH   rI   ry      rk   zExtractionAgent.configry   c                 C   s
   || _ d S rd   )rB   )rG   ry   rH   rH   rI   ry      s   

file_inputc              	      s   t |tr	|}nt |trt|}nt |ttfr t|d}ntdz| jj	j
| j|dI dH W t |tr<|  S S t |trG|  w w )zUpload a file for extraction.rbzJfile_input must be either a file path string, file bytes, or buffer object)r0   upload_fileN)ro   r   bytesr   strr   openrt   r<   filesr|   r>   r   close)rG   rz   r|   rH   rH   rI   _upload_file   s&   





zExtractionAgent._upload_filejob_idc                    s   t  }d}	 t| jI dH  |d7 }| jjj|dI dH }|jt	j
kr1| jjj|dI dH S |jt	jkrZt  }|| | jkrItd| | jrY|d dkrYtdd	dd
 qtd| d|j d|j  | jjj|dI dH S )z5Wait for and return the results of an extraction job.r   TNr*   r   z#Timeout while extracting the file: r9   . )endflushzFailure in job: z
, status: z	, error: )timeperf_counterrZ   sleepr2   r<   ru   get_jobstatusr   SUCCESSget_run_by_job_idPENDINGr3   	Exceptionr@   printwarningswarnerror)rG   r   starttriesjobr   rH   rH   rI   _wait_for_job_result   s4   z$ExtractionAgent._wait_for_job_resultc                 C   s,   |  | jjj| jt| j| jdd| _dS )zPersist the extraction agent's schema and config to the database.

        Returns:
            ExtractionAgent: The updated extraction agent
        )rj   ry   )extraction_agent_idrn   N)	rb   r<   ru   update_extraction_agentre   r   rj   ry   r=   rf   rH   rH   rI   save   s   
zExtractionAgent.saver   extract_settingsc                    s2  t |ts|g}d}nd}fdd|D }t  t|jdjdI d H }W d    n1 s2w   Y  dtdtf fd	d
fdd|D }t  t|jdjdI d H }W d    n1 sgw   Y  jrt	||D ]\}}	t |t
tfrt
|nd}
td|
 d|	j  qt|r|d S |S )NTFc                       g | ]}  |qS rH   r   .0filerf   rH   rI   
<listcomp>       z:ExtractionAgent._queue_extraction_test.<locals>.<listcomp>Uploading filesworkersdescr5   r   rL   c                    s@   j jjtj| jjjd dI d H }|jI d H S )Nr   file_iddata_schema_overrideconfig_override)
job_creater   )r<   ru   run_job_test_userr   re   rj   ry   r   )r   
job_queued)r   rG   rH   rI   run_job   s   	z7ExtractionAgent._queue_extraction_test.<locals>.run_jobc                    s   g | ]} |qS rH   rH   r   )r   rH   rI   r      s    zRunning extraction jobs<bytes/buffer> Queued file extraction for file  under job_id r   )ro   listr   r!   r4   r5   r   r   r@   zipr~   r   r   re   )rG   r   r   single_fileupload_tasksuploaded_files	job_tasksextract_jobsr   r   	file_reprrH   )r   r   rG   rI   _queue_extraction_test   s@   
z&ExtractionAgent._queue_extraction_testc           
         s  	 t |ts|g}d}nd} fdd|D }t  t| jd jdI dH }W d   n1 s3w   Y   fdd|D }t  t| jd	 jdI dH }W d   n1 s\w   Y   jrt||D ]\}}t |tt	frxt|nd
}	t
d|	 d|j  qi|r|d S |S )z
        Queue multiple files for extraction.

        Args:
            files (Union[FileInput, List[FileInput]]): The files to extract

        Returns:
            Union[ExtractJob, List[ExtractJob]]: The queued extraction jobs
        TFc                    r   rH   r   r   rf   rH   rI   r     r   z4ExtractionAgent.queue_extraction.<locals>.<listcomp>r   r   Nc              	      s0   g | ]} j jjt j|j j jd dqS )r   rm   )r<   ru   r   r   re   rj   ry   r   rf   rH   rI   r   $  s    	zCreating extraction jobsr   r   r   r   )ro   r   r   r!   r4   r5   r@   r   r~   r   r   re   )
rG   r   r   r   r   r   r   r   r   r   rH   rf   rI   queue_extraction  sD   

	z ExtractionAgent.queue_extractionc                    s   t |ts|g}d}nd} |I dH } fdd|D }t  t| jd jdI dH }W d   n1 s:w   Y  |rE|d S |S )	a  Asynchronously extract data from one or more files using this agent.

        Args:
            files (Union[FileInput, List[FileInput]]): The files to extract

        Returns:
            Union[ExtractRun, List[ExtractRun]]: The extraction results
        TFNc                    s   g | ]}  |jqS rH   )r   re   )r   r   rf   rH   rI   r   V  s    z,ExtractionAgent.aextract.<locals>.<listcomp>zExtracting filesr   r   )ro   r   r   r   r!   r4   r5   )rG   r   r   jobsresult_tasksresultsrH   rf   rI   aextractB  s    
zExtractionAgent.aextractc                 C   s   |  | |S )a  Synchronously extract data from one or more files using this agent.

        Args:
            files (Union[FileInput, List[FileInput]]): The files to extract

        Returns:
            Union[ExtractRun, List[ExtractRun]]: The extraction results
        )rb   r   )rG   r   rH   rH   rI   extracta  s   zExtractionAgent.extractc                 C      |  | jjj|dS )z
        Get the extraction job for a given job_id.

        Args:
            job_id (str): The job_id to get the extraction job for

        Returns:
            ExtractJob: The extraction job
        r   )rb   r<   ru   r   rG   r   rH   rH   rI   get_extraction_jobn  s   
z"ExtractionAgent.get_extraction_jobc                 C   r   )z
        Get the extraction run for a given job_id.

        Args:
            job_id (str): The job_id to get the extraction run for

        Returns:
            ExtractRun: The extraction run
        r   )rb   r<   ru   r   r   rH   rH   rI   get_extraction_run_for_jobz  s
   
z*ExtractionAgent.get_extraction_run_for_jobrun_idc                 C      |  | jjj|d dS )zxDelete an extraction run by ID.

        Args:
            run_id (str): The ID of the extraction run to delete
        )r   N)rb   r<   ru   delete_extraction_run)rG   r   rH   rH   rI   r     s   z%ExtractionAgent.delete_extraction_runr   d   pagelimitc                 C   s    |  | jjj| j|| |dS )zList extraction runs for the extraction agent.

        Returns:
            PaginatedExtractRunsResponse: Paginated list of extraction runs
        )r   skipr   )rb   r<   ru   list_extract_runsre   )rG   r   r   rH   rH   rI   list_extraction_runs  s   z$ExtractionAgent.list_extraction_runsc                 C   s   d| j  d| j dS )NzExtractionAgent(id=z, name=))re   rh   rf   rH   rH   rI   __repr__  s   zExtractionAgent.__repr__)	NNr*   r+   r,   TFTr-   )rL   N)r   r   )-__name__
__module____qualname____doc__r   CloudExtractAgentr   r~   intboolfloatrJ   r
   r   r&   rb   propertyre   rh   rp   rj   setterSchemaInputr   ry   	FileInputr   r   r   r   r   r	   r   r   r   r   r   r   r   r   r   r   r   r   r   rH   rH   rH   rI   r)   .   s    	



4
;


r)   c                       s  e Zd ZU dZeddZeed< eddZeed< eddd	Z	e
ed
< eddd	Ze
ed< edddddZe
ed< eddd	Zeed< eddd	Zeed< eddd	Zee ed< eddd	Zee ed< e Zeed< e Zeed < e Zee ed!< e Zee ed"< 	#	#					#	#			d9dee dee d
e
de
de
ded$ee d%ee dee dee def fd&d'Zd(eeeef d)efd*d+Z	#d:d,ed-e d.ee! d)e"fd/d0Z#	#	#d;d,ee d1ee d)e"fd2d3Z$d)e%e" fd4d5Z&d6ed)d#fd7d8Z'  Z(S )<LlamaExtractz:Factory class for creating and managing extraction agents.z%The API key for the LlamaExtract API.)descriptionapi_keyz%The base URL of the LlamaExtract API.base_urlr*   z;The interval in seconds to check if the extraction is done.)defaultr   r2   r+   zDThe maximum timeout in seconds to wait for the extraction to finish.r3   r,   r   r9   zAThe number of workers to use sending API requests for extraction.)r   gtltr   r4   Tz-Show progress when extracting multiple files.r5   Fz*Show verbose output when extracting files.r6   zSimple SSL verification option.r7   r-   zTimeout for the httpx client.r8   _async_clientrF   r>   r?   Nr0   r1   c                    s  |st dd }|d u rtd|st dd pt}t j|||||||	|
|d	 tj|	|
d| _|	| _	|
| _
t| j| j| jd| _ttdt  pJdd	 d
| _|szt dd }|sztd | | jj }dd |D }|sutd|d j}|| _|| _d S )NLLAMA_CLOUD_API_KEYzThe API key is required.LLAMA_CLOUD_BASE_URL)	r   r   r2   r3   r4   r5   r7   r8   r6   rP   )tokenr   rS   r9   r*   r,   r:   LLAMA_CLOUD_PROJECT_IDz1No project_id provided, fetching default project.c                 S   s   g | ]}|j r|qS rH   )
is_default)r   prH   rH   rI   r     r   z)LlamaExtract.__init__.<locals>.<listcomp>z6No default project found. Please provide a project_id.r   )rD   getenvrt   r$   superrJ   rT   rU   _httpx_clientr7   r8   r   r   r   r   r%   rC   rE   rF   r   rb   projectslist_projectsre   r>   r?   )rG   r   r   r2   r3   r4   r5   r0   r1   r7   r8   r6   r   default_project	__class__rH   rI   rJ     sV   


zLlamaExtract.__init__rK   rL   c                    rM   )rN   rL   c                     rO   )NrL   c               	      s   j d us
J dtjjjd4 I d H $} | jj_z I d H W j jj_W  d   I d H  S j jj_w 1 I d H sAw   Y  d S )Nz"httpx_client should be initializedrP   )r   rT   rU   r7   r8   r   rR   rS   )r.   rW   rH   rI   rX     s$   


zCLlamaExtract._run_in_thread.<locals>.run_coro.<locals>.wrapped_cororY   r\   rW   rH   rI   r]     s   z-LlamaExtract._run_in_thread.<locals>.run_coror^   ra   rH   rW   rI   rb     s   zLlamaExtract._run_in_threadrh   rj   ry   c                 C   s   t |tr|}nt|tr| }ntd| | jjj	| j
| jt|||p&tdd}t| j|| j
| j| j| j| j| j| jd	S )a]  Create a new extraction agent.

        Args:
            name (str): The name of the extraction agent
            data_schema (SchemaInput): The data schema for the extraction agent
            config (Optional[ExtractConfig]): The extraction config for the agent

        Returns:
            ExtractionAgent: The created extraction agent
        rl   )rh   rj   ry   )r0   r1   rn   	r.   r/   r0   r1   r2   r3   r4   r5   r6   )ro   rp   rr   r   rs   rt   rb   r   ru   create_extraction_agentr>   r?   r   DEFAULT_EXTRACT_CONFIGr)   r2   r3   r4   r5   r6   )rG   rh   rj   ry   r/   rH   rH   rI   create_agent'  s:   


zLlamaExtract.create_agentre   c                 C   s   |dur|durt d |r| | jjj|d}n|r+| | jjj|| jd}ntdt	| j|| j| j
| j| j| j| j| j| j| jdS )a  Get extraction agents by name or extraction agent ID.

        Args:
            name (Optional[str]): Filter by name
            extraction_agent_id (Optional[str]): Filter by extraction agent ID

        Returns:
            ExtractionAgent: The extraction agent
        NzJBoth name and extraction_agent_id are provided. Using extraction_agent_id.r   )rh   r0   z4Either name or extraction_agent_id must be provided.)r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   )r   r   rb   r   ru   get_extraction_agentget_extraction_agent_by_namer>   rt   r)   r?   r2   r3   r4   r5   r6   r7   r8   )rG   rh   re   r/   rH   rH   rI   	get_agentY  s>   zLlamaExtract.get_agentc                    s*      jjj jd} fdd|D S )z%List all available extraction agents.)r0   c                    s6   g | ]}t  j| j j j j j j jd 	qS )r   )	r)   r   r>   r?   r2   r3   r4   r5   r6   )r   r/   rf   rH   rI   r     s    z,LlamaExtract.list_agents.<locals>.<listcomp>)rb   r   ru   list_extraction_agentsr>   )rG   agentsrH   rf   rI   list_agents  s   
zLlamaExtract.list_agentsagent_idc                 C   r   )zzDelete an extraction agent by ID.

        Args:
            agent_id (str): ID of the extraction agent to delete
        r   N)rb   r   ru   delete_extraction_agent)rG   r   rH   rH   rI   delete_agent  s
   zLlamaExtract.delete_agent)NNr*   r+   r,   TNNTr-   Frd   )NN))r   r   r   r   r"   r   r~   __annotations__r   r2   r   r3   r4   r5   r   r6   r7   r   r8   r   r#   r   r   rF   r%   r>   r?   rJ   r
   r   r&   rb   r   r   r)   r   r   r   r   r  __classcell__rH   rH   r   rI   r     s   
 	
? 
4
2r   __main__)load_dotenvtestsdataz
test-agent)rh   objectrq   string)titlesummary)rq   
propertiesslidezconocophilips.pdf)PrZ   rD   r   ior   r   r   pathlibr   typingr   r   r   r	   r
   r   r   r   rT   pydanticr   llama_cloudr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   llama_cloud.clientr   "llama_cloud_services.extract.utilsr   r   llama_index.core.schemar    llama_index.core.async_utilsr!    llama_index.core.bridge.pydanticr"   r#   llama_index.core.constantsr$   concurrent.futuresr%   r&   r~   r}   r   r   PER_DOCACCURATEr   r)   r   r   dotenvr  __file__parentdata_dir	extractorr   r/   r   r   r   r   r  re   r   rH   rH   rH   rI   <module>   sf    $D  |  
	