U
    @6^                     @   s8   d dl Z d dlZd dlmZmZmZ G dd deZdS )    N)
SpiderFootSpiderFootPluginSpiderFootEventc                   @   sd   e Zd ZdZddiZddiZdZe ZdZ	e
 fddZd	d
 Zdd Zdd Zdd Zdd ZdS )sfp_commoncrawlzeCommonCrawl:Footprint,Passive:Crawling and Scanning::Searches for URLs found through CommonCrawl.org.indexes   zYNumber of most recent indexes to attempt, because results tend to be occasionally patchy.NFc                 C   sB   || _ |  | _t | _d| _t| D ]}|| | j|< q*d S )NF)sftempStorageresultslist	indexBase
errorStatekeysopts)selfsfcuserOptsopt r   :/var/www/spiderfoot.crq.systems/modules/sfp_commoncrawl.pysetup&   s    
zsfp_commoncrawl.setupc                 C   s   t  }| jD ]}d| d | d }| jj|ddd}|d dkr\| jd	d
 d| _ d S |d s~| jd	d
 d| _ d S ||d  q|S )Nzhttps://index.commoncrawl.org/z-index?url=z/*&output=json<   r   timeout	useragentcode400401402403404z0CommonCrawl search doesn't seem to be available.FTcontent)r   r   r   fetchUrlerrorr   append)r   targetretindexurlresr   r   r   search/   s,    

zsfp_commoncrawl.searchc                 C   sB  d}| j j|ddd}|d dkr<| j dd d	| _t S |d
 s^| j dd d	| _t S td|d
 }d}t }|D ] }|dddd}d	||< q|t	t|
 d	dd| jd  }t|| jd k r| j dd d	| _t S t }	|D ]2}
|	dt|
dd  d t|
dd   q| j dt|	  |	S )NzDhttps://commoncrawl.s3.amazonaws.com/cc-index/collections/index.htmlr   r   r   r   r   z:CommonCrawl index collection doesn't seem to be available.FTr"   z.*(CC-MAIN-\d+-\d+).*r   zCC-MAIN- -)reverser   z,Not able to find latest CommonCrawl indexes.   r   zCommonCrawl indexes: )r   r#   r$   r   r   refindalldictreplacesortedr   r   lenr%   strdebug)r   r)   r*   r   ZhighestZ	indexlistmmsZ
topindexesZretindexir   r   r   getLatestIndexesE   s8    

"0z sfp_commoncrawl.getLatestIndexesc                 C   s   dgS )NINTERNET_NAMEr   r   r   r   r   watchedEventsi   s    zsfp_commoncrawl.watchedEventsc                 C   s   dgS )NLINKED_URL_INTERNALr   r=   r   r   r   producedEventso   s    zsfp_commoncrawl.producedEventsc                 C   s  |j }|j}|j}| jd| d |  | jr4d S || jkrBd S d| j|< t| jdkrd| 	 | _| js|| j
dd d S t| jdkr| j
dd d S | |}|s| j
dd d S t }|D ]}z|dD ]v}|  r W  d S t|d	k rqt|}	d
|	krq|	d
 |kr q||	d
  td|	d
 | j|}
| |
 qW q tk
r } z"| j
dt| d W Y  d S d }~X Y qX qd S )NzReceived event, z, from Tr   z"Unable to fetch CommonCrawl index.Fz*Unable to obtain content from CommonCrawl.
   r)   r?   z%Malformed JSON from CommonCrawl.org: )	eventTypemoduledatar   r7   r   r
   r5   r   r;   r$   r+   r   splitcheckForStopjsonloadsr%   r   __name__notifyListenersBaseExceptionr6   )r   event	eventNamesrcModuleName	eventDatarE   sentr"   linelinkevter   r   r   handleEvents   sV    







 zsfp_commoncrawl.handleEvent)rJ   
__module____qualname____doc__r   optdescsr
   r   r   r   r2   r   r+   r;   r>   r@   rV   r   r   r   r   r      s      	$r   )r0   rH   sflibr   r   r   r   r   r   r   r   <module>   s   