a
    (gR*                     @   s.   d dl Z d dlmZmZmZ G dd dZdS )    N)DictListOptionalc                   @   s   e Zd ZeedddZedddZedddZeed	d
dZeed	ddZeed	ddZ	eed	ddZ
eed	ddZeed	ddZeed	ddZeedddZeed	ddZd%eeeddd Zeed!d"d#Zd$S )&SECReportCleaner)content	file_pathc                 C   s>   |rt | dkrtd|| _|| _| || _i | _d S )Nr   z&Otrzymano pusty tekst do przetworzenia)lenstrip
ValueErrorr   r   _detect_report_typereport_typesections)selfr   r    r   4/var/www/html/inwestownie/raporty/src/sec_cleaner.py__init__   s    zSECReportCleaner.__init__)returnc                 C   sT   t d| j  t dt| j  | jdkr6|  S | jdkrH|  S |  S dS )u   Główna metoda czyszczącazPrzetwarzanie raportu typu: u   Początkowa długość tekstu: 10-Q14AN)printr   r   r   Z
_clean_10q
_clean_14aZ_clean_generic)r   r   r   r   clean   s    

zSECReportCleaner.cleanc                 C   s   | j }| |}tdt|  | |}tdt|  tjdd|tjd}tdt|  | |}tdt|  | 	|}tdt|  | 
|}td	t|  |S )
u7   Metoda czyszcząca dla raportów 14A (proxy statements)u   Po usunięciu nagłówka SEC: u%   Po wyciągnięciu głównej treści: z<style.*?</style> flagsu   Po usunięciu CSS: u   Po wyczyszczeniu tagów HTML: zPo formatowaniu sekcji proxy: u   Po końcowym czyszczeniu: )r   _remove_sec_headerr   r   _extract_main_contentresubDOTALL_clean_html_tags_format_proxy_sections_clean_whitespace)r   cleaned_contentr   r   r   r      s    




zSECReportCleaner._clean_14a)textr   c           
      C   s   dddddddd}g }|  D ]@\}}tt||tj}|D ]}|| d	| d
f q@q |jdd |D ]$\}}	|d| |	 ||d  }qr|S )z*Formatuje sekcje dokumentu proxy statementzGENERAL\s+INFORMATIONzCORPORATE\s+GOVERNANCEzEXECUTIVE\s+COMPENSATIONzSECURITY\s+OWNERSHIPzAUDIT\s+COMMITTEEzPROPOSAL\s+\d+zSTOCKHOLDER\s+PROPOSALS)zGENERAL INFORMATIONzCORPORATE GOVERNANCEzEXECUTIVE COMPENSATIONzSECURITY OWNERSHIPzAUDIT COMMITTEEZPROPOSALzSTOCKHOLDER PROPOSALS

===  ===

TreverseNitemslistr   finditer
IGNORECASEappendstartsort)
r   r$   Zproxy_sectionsr   section_namepatternmatchesmatchposheaderr   r   r   r!   7   s"    
z'SECReportCleaner._format_proxy_sectionsc                 C   sP   d|v rLd|v rL|| d| d }| |}|dd }|d | }|S )u2   Usuwa nagłówek SEC zachowując ważne informacjez<SEC-HEADER>z</SEC-HEADER>   

)find_extract_important_header_infosplit)r   r$   r6   Zimportant_infor   r   r   r   O   s    
z#SECReportCleaner._remove_sec_headerc                 C   s   d|vrd|vr|S z2| d}|d}||k rD||d | W S W n tyX   Y n0 td|tjtjB }|r~|dS |S )u$   Wyciąga główną treść dokumentuz<TEXT>z</TEXT>   z<TEXT>(.*)</TEXT>r7   )indexrindexr
   r   searchr   r-   group)r   r$   Z	start_idxZend_idxZmain_content_matchr   r   r   r   Z   s    


z&SECReportCleaner._extract_main_contentc                 C   s|   t jdd|t jt jB d}t jdd|t jd}t jdd|t jd}t jdd	|t jd}t jd
d	|t jd}t dd|}|S )u2   Czyści tagi HTML zachowując strukturę dokumentuz
<TABLE.*?>z

[TABELA]
r   z</TABLE>z
[KONIEC TABELI]

z<p.*?>r8   z<br.*?>
z<div.*?>z<[^>]+> )r   r   r-   r   r   r$   r   r   r   r    p   s    z!SECReportCleaner._clean_html_tagsc                 C   s    dd }t jd||t jd}|S )u   Czyści i formatuje tabelec                 S   s   |  d}td|tjtjB }|s(|S g }|D ]<}td|tjtjB }|r0dd |D }|d| q0|rdddd	 |d d
D  d }d| dd| d| d| d S dS )Nr   z<TR.*?>(.*?)</TR>z<T[HD].*?>(.*?)</T[HD]>c                 S   s   g | ]}|  d dqS )rA   rB   )r	   replace.0Zcellr   r   r   
<listcomp>       zHSECReportCleaner._clean_tables.<locals>.format_table.<locals>.<listcomp>z | +c                 s   s   | ]}d t | V  qdS )-N)r   rE   r   r   r   	<genexpr>   rH   zGSECReportCleaner._clean_tables.<locals>.format_table.<locals>.<genexpr>|rA   r   )r@   r   findallr   r-   r.   joinr;   )r4   Ztable_contentZrowsZformatted_rowsZrowZcellsZformatted_cells	separatorr   r   r   format_table   s    
&*z4SECReportCleaner._clean_tables.<locals>.format_tablez<TABLE.*?</TABLE>r   )r   r   r   )r   r$   rP   r   r   r   _clean_tables   s    zSECReportCleaner._clean_tablesc           
      C   s   dddddddd}g }|  D ]@\}}tt||tj}|D ]}|| d	| d
f q@q |jdd |D ]$\}}	|d| |	 ||d  }qr|S )zFormatuje sekcje dokumentuzItem\s*1\.\s*BusinesszItem\s*1A\.\s*Risk\s*Factorsz#Item\s*7\.\s*Management.*Discussionz#Item\s*8\.\s*Financial\s*StatementszItem\s*6\.\s*Exhibitsz Item\s*5\.\s*Other\s*Informationz Item\s*3\.\s*Legal\s*Proceedings)ZBUSINESSzRISK FACTORSzMD&AzFINANCIAL STATEMENTSZEXHIBITSzOTHER INFORMATIONzLEGAL PROCEEDINGSr%   r&   Tr'   Nr)   )
r   r$   Zsection_patternsr   r1   r2   r3   r4   r5   r6   r   r   r   _format_sections   s"    z!SECReportCleaner._format_sectionsc                 C   s<   t dd|}t dd|}ddd | D }| S )u   Czyści nadmiarowe białe znakiz +rB   z\n{3,}r8   rA   c                 s   s   | ]}|  V  qd S N)r	   )rF   liner   r   r   rK      rH   z5SECReportCleaner._clean_whitespace.<locals>.<genexpr>)r   r   rN   
splitlinesr	   rC   r   r   r   r"      s    z"SECReportCleaner._clean_whitespace)r6   r   c                 C   sT   g d}g }|D ]8}t | d|}|r|| d|d   qd|S )u+   Wyciąga ważne informacje z nagłówka SEC)zCOMPANY CONFORMED NAMEzFILED AS OF DATEzCONFORMED SUBMISSION TYPEzFISCAL YEAR ENDz:\s*(.+)z: r7   rA   )r   r?   r.   r@   r	   rN   )r   r6   Zimportant_fieldsZ
info_linesZfieldr4   r   r   r   r:      s     z/SECReportCleaner._extract_important_header_infoc                    s^   d|v rt dd|}g }| D ]0 t fdddD s"t d s"|  q"d|S )	u   Czyści dane XBRL i metadanez<XBRL>z(?s)<XBRL>.*?</XBRL>r   c                 3   s   | ]}|   v V  qd S rS   )lower)rF   prefixrT   r   r   rK      rH   z4SECReportCleaner._clean_xbrl_data.<locals>.<genexpr>)zxbrl:zus-gaap:ziso4217:zefr:zsrt:z
^\d{10}\s+rA   )r   r   rU   anyr4   r.   rN   )r   r$   Zcleaned_linesr   rX   r   _clean_xbrl_data   s    z!SECReportCleaner._clean_xbrl_datax   )r$   
max_lengthr   c           	      C   s   g }|  D ]}t||kr| }g }d}|D ]R}|t| d |krd|| |t|d 7 }q0|d| |g}t|}q0|r|d| q|| qd|S )u7   Formatuje długie linie tekstu zachowując czytelnośćr   r7   rB   rA   )rU   r   r;   r.   rN   )	r   r$   r\   Zformatted_linesrT   wordsZcurrent_lineZcurrent_lengthZwordr   r   r   _format_long_lines   s"    

z#SECReportCleaner._format_long_lines)r   r   c                 C   sJ   d|v rdS d|v rdS d|v r$dS d|v s4d|v r8dS t d| dS )z,Wykrywa typ raportu na podstawie nazwy plikur   z8-Kz10-KzDEF 14Ar   zNieznany typ raportu: N)r
   )r   r   r   r   r   r      s    z$SECReportCleaner._detect_report_typeN)r[   )__name__
__module____qualname__strr   r   r   r!   r   r   r    rQ   rR   r"   r:   rZ   intr^   r   r   r   r   r   r      s   
r   )r   typingr   r   r   r   r   r   r   r   <module>   s   