Spaces:

DeL-TaiseiOzaki
/

Repository_Scaner

Sleeping

App Files Files Community

DeL-TaiseiOzaki committed on Oct 30, 2024

Commit

560aacd

1 Parent(s): fcac1a1

ss

Browse files

Files changed (5) hide show

app.py +126 -143
config/llm_settings.py +4 -9
config/settings.py +2 -2
core/file_scanner.py +40 -36
services/llm_service.py +10 -46

app.py CHANGED Viewed

@@ -1,177 +1,89 @@
 import streamlit as st
 import tempfile
 import git
-from core.file_scanner import FileScanner
 from pathlib import Path
-from datetime import datetime
 from services.llm_service import LLMService
-from core.file_scanner import FileInfo
 from typing import List
-# ページ設定
 st.set_page_config(
-    page_title="Repository Code Analysis",
-    page_icon="🔍",
-    layout="wide"
 )
-# ダークテーマの設定
 st.markdown("""
 <style>
-    .stApp {
-        background-color: #0e1117;
-        color: #ffffff;
-    }
-    .chat-message {
-        padding: 1rem;
-        margin: 1rem 0;
-        border-radius: 0.5rem;
-    }
-    .assistant-message {
-        background-color: #1e2329;
-        color: #ffffff;
-    }
-    .stButton button {
-        background-color: #2ea44f;
-        color: #ffffff;
-    }
-    .stTextArea textarea {
-        background-color: #1e2329;
-        color: #ffffff;
-    }
 </style>
 """, unsafe_allow_html=True)
 def clone_repository(repo_url: str) -> Path:
-    """リポジトリをクローンして一時ディレクトリに保存"""
-    temp_dir = Path(tempfile.mkdtemp())
-    git.Repo.clone_from(repo_url, temp_dir)
-    return temp_dir
-def create_download_content(files: List[FileInfo]) -> str:
-    content = "# スキャン結果\n\n"
-    for file in files:
-        content += f"## {file.path}\n"
-        content += f"サイズ: {file.formatted_size}\n"
-        content += f"エンコーディング: {file.encoding or '不明'}\n\n"
-        if file.content:
-            content += f"```{file.extension[1:] if file.extension else ''}\n"
-            content += file.content
-            content += "\n```\n\n"
-    return content
-# セッション状態の初期化
 if 'repo_content' not in st.session_state:
-    st.session_state.repo_content = None
 if 'temp_dir' not in st.session_state:
-    st.session_state.temp_dir = None
 if 'llm_service' not in st.session_state:
-    try:
-        st.session_state.llm_service = LLMService()
-    except ValueError as e:
-        st.error(str(e))
-        st.stop()
-# メインのUIレイアウト
 st.title("🔍 リポジトリ解析・質問システム")
-# サイドバーでモデル選択
-available_models = st.session_state.llm_service.settings.get_available_models()
-if len(available_models) > 1:
-    selected_model = st.sidebar.selectbox(
-        "使用するモデル",
-        available_models,
-        index=available_models.index(st.session_state.llm_service.current_model)
-    )
-    st.session_state.llm_service.switch_model(selected_model)
-# URLの入力
-repo_url = st.text_input(
-    "GitHubリポジトリのURLを入力",
-    placeholder="https://github.com/username/repository.git"
-)
-# スキャン実行ボタン
-if st.button("スキャン開始", disabled=not repo_url):
-    try:
-        with st.spinner('リポジトリをクローン中...'):
-            temp_dir = clone_repository(repo_url)
-            st.session_state.temp_dir = temp_dir
-        with st.spinner('ファイルをスキャン中...'):
-            scanner = FileScanner(temp_dir)
-            files = scanner.scan_files()  # List[FileInfo] を取得
-            st.session_state.repo_content = LLMService.format_code_content(files)
-        st.success(f"スキャン完了: {len(files)}個のファイルを検出")
-        # 新しいスキャン時に会話履歴をクリア
-        st.session_state.llm_service.clear_history()
-    except Exception as e:
-        st.error(f"エラーが発生しました: {str(e)}")
-# スキャン完了後の質問セクション
-if st.session_state.repo_content:
-    st.divider()
-    st.subheader("💭 コードについて質問する")
-    # スキャン結果のダウンロードボタン
-    scan_result = create_download_content(files)  # filesはスキャン結果
-    st.download_button(
-        label="スキャン結果をダウンロード",
-        data=scan_result,
-        file_name=f"scan_result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md",
-        mime="text/markdown"
-    )
-    # 会話履歴の表示（アシスタントの回答のみ）
-    for message in st.session_state.llm_service.conversation_history:
-        if message.role == "assistant":  # アシスタントの回答のみを表示
-            st.markdown(f'<div class="chat-message assistant-message">{message.content}</div>',
-                       unsafe_allow_html=True)
-    query = st.text_area(
-        "質問を入力してください",
-        placeholder="例: このコードの主な機能は何ですか？"
-    )
-    col1, col2 = st.columns([1, 5])
-    with col1:
-        if st.button("履歴クリア"):
-            st.session_state.llm_service.clear_history()
-            st.rerun()
-    with col2:
-        if st.button("質問する", disabled=not query):
-            with st.spinner('回答を生成中...'):
-                response, error = st.session_state.llm_service.get_response(
-                    st.session_state.repo_content,
-                    query
-                )
-                if error:
-                    st.error(error)
-                else:
-                    st.rerun()  # 会話履歴を更新するために再表示
-# セッション終了時のクリーンアップ
-if st.session_state.temp_dir and Path(st.session_state.temp_dir).exists():
-    try:
-        import shutil
-        shutil.rmtree(st.session_state.temp_dir)
-    except:
-        pass
-# サイドバー情報
-with st.sidebar:
     st.subheader("📌 使い方")
     st.markdown("""
     1. GitHubリポジトリのURLを入力
     2. スキャンを実行
     3. コードについて質問（最大5ターンの会話が可能）
     """)
     st.subheader("🔍 スキャン対象")
     st.markdown("""
     - Python (.py)
@@ -179,4 +91,75 @@ with st.sidebar:
     - Java (.java)
     - C/C++ (.c, .h, .cpp, .hpp)
     - その他の主要なプログラミング言語
-    """)

 import streamlit as st
 import tempfile
 import git
 from pathlib import Path
+from datetime import datetime
 from services.llm_service import LLMService
+from core.file_scanner import FileScanner, FileInfo
 from typing import List
 st.set_page_config(
+   page_title="Repository Code Analysis",
+   page_icon="🔍",
+   layout="wide"
 )
 st.markdown("""
 <style>
+   .stApp {
+       background-color: #0e1117;
+       color: #ffffff;
+   }
+   .chat-message {
+       padding: 1rem;
+       margin: 1rem 0;
+       border-radius: 0.5rem;
+   }
+   .assistant-message {
+       background-color: #1e2329;
+       color: #ffffff;
+   }
+   .stButton button {
+       background-color: #2ea44f;
+       color: #ffffff;
+   }
+   .stTextArea textarea {
+       background-color: #1e2329;
+       color: #ffffff;
+   }
 </style>
 """, unsafe_allow_html=True)
+def create_download_content(files: List[FileInfo]) -> str:
+   content = "# スキャン結果\n\n"
+   for file in files:
+       content += f"## {file.path}\n"
+       content += f"サイズ: {file.formatted_size}\n"
+       content += f"エンコーディング: {file.encoding or '不明'}\n\n"
+       if file.content:
+           content += f"```{file.extension[1:] if file.extension else ''}\n"
+           content += file.content
+           content += "\n```\n\n"
+   return content
 def clone_repository(repo_url: str) -> Path:
+   temp_dir = Path(tempfile.mkdtemp())
+   git.Repo.clone_from(repo_url, temp_dir)
+   return temp_dir
 if 'repo_content' not in st.session_state:
+   st.session_state.repo_content = None
 if 'temp_dir' not in st.session_state:
+   st.session_state.temp_dir = None
 if 'llm_service' not in st.session_state:
+   try:
+       st.session_state.llm_service = LLMService()
+   except ValueError as e:
+       st.error(str(e))
+       st.stop()
 st.title("🔍 リポジトリ解析・質問システム")
+with st.sidebar:
+    if not st.session_state.llm_service.settings.anthropic_api_key:
+        st.error("Anthropic API key is required")
+        st.stop()
+    st.write("Using Claude model")
+    st.divider()
     st.subheader("📌 使い方")
     st.markdown("""
     1. GitHubリポジトリのURLを入力
     2. スキャンを実行
     3. コードについて質問（最大5ターンの会話が可能）
     """)
     st.subheader("🔍 スキャン対象")
     st.markdown("""
     - Python (.py)
     - Java (.java)
     - C/C++ (.c, .h, .cpp, .hpp)
     - その他の主要なプログラミング言語
+    """)
+repo_url = st.text_input(
+   "GitHubリポジトリのURLを入力",
+   placeholder="https://github.com/username/repository.git"
+)
+if st.button("スキャン開始", disabled=not repo_url):
+   try:
+       with st.spinner('リポジトリをクローン中...'):
+           temp_dir = clone_repository(repo_url)
+           st.session_state.temp_dir = temp_dir
+       with st.spinner('ファイルをスキャン中...'):
+           scanner = FileScanner(temp_dir)
+           files = scanner.scan_files()
+           st.session_state.repo_content = LLMService.format_code_content(files)
+       st.success(f"スキャン完了: {len(files)}個のファイルを検出")
+       scan_result = create_download_content(files)
+       st.download_button(
+           label="スキャン結果をダウンロード",
+           data=scan_result,
+           file_name=f"scan_result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md",
+           mime="text/markdown"
+       )
+       st.session_state.llm_service.clear_history()
+   except Exception as e:
+       st.error(f"エラーが発生しました: {str(e)}")
+if st.session_state.repo_content:
+   st.divider()
+   st.subheader("💭 コードについて質問する")
+   for message in st.session_state.llm_service.conversation_history:
+       if message.role == "assistant":
+           st.markdown(f'<div class="chat-message assistant-message">{message.content}</div>',
+                      unsafe_allow_html=True)
+   query = st.text_area(
+       "質問を入力してください",
+       placeholder="例: このコードの主な機能は何ですか？"
+   )
+   col1, col2 = st.columns([1, 5])
+   with col1:
+       if st.button("履歴クリア"):
+           st.session_state.llm_service.clear_history()
+           st.rerun()
+   with col2:
+       if st.button("質問する", disabled=not query):
+           with st.spinner('回答を生成中...'):
+               response, error = st.session_state.llm_service.get_response(
+                   st.session_state.repo_content,
+                   query
+               )
+               if error:
+                   st.error(error)
+               else:
+                   st.rerun()
+if st.session_state.temp_dir and Path(st.session_state.temp_dir).exists():
+   try:
+       import shutil
+       shutil.rmtree(st.session_state.temp_dir)
+   except:
+       pass

config/llm_settings.py CHANGED Viewed

@@ -5,15 +5,10 @@ class LLMSettings:
     def __init__(self):
         load_dotenv()
         self.anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
-        self.openai_api_key = os.getenv("OPENAI_API_KEY")
         self.default_llm = "claude"
     def get_available_models(self):
-        available_models = []
-        if self.anthropic_api_key:
-            available_models.append("claude")
-        if self.openai_api_key:
-            available_models.append("openai")
-        if not available_models:
-            raise ValueError("APIキーが設定されていません")
-        return available_models

     def __init__(self):
         load_dotenv()
         self.anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
         self.default_llm = "claude"
+        if not self.anthropic_api_key:
+            raise ValueError("ANTHROPIC_API_KEY is required")
     def get_available_models(self):
+        return ["claude"]

config/settings.py CHANGED Viewed

@@ -14,5 +14,5 @@ class Settings:
         return cls.DEFAULT_OUTPUT_DIR / f"repo_clone_{timestamp}"
     @classmethod
-    def get_output_file(cls, timestamp: str) -> Path:
-        return cls.DEFAULT_OUTPUT_DIR / f"scan_result_{timestamp}.txt"

         return cls.DEFAULT_OUTPUT_DIR / f"repo_clone_{timestamp}"
     @classmethod
+    def get_log_file(cls, timestamp: str) -> Path:
+        return cls.DEFAULT_OUTPUT_DIR / f"scan_log_{timestamp}.txt"

core/file_scanner.py CHANGED Viewed

@@ -1,60 +1,64 @@
 from pathlib import Path
 from typing import List, Dict, Optional
 from dataclasses import dataclass
 @dataclass
 class FileInfo:
     path: Path
     content: Optional[str] = None
 class FileScanner:
-    # スキャン対象の拡張子
-    TARGET_EXTENSIONS = {
-        '.py', '.js', '.java', '.cpp', '.hpp', '.c', '.h',
-        '.go', '.rs', '.php', '.rb', '.ts', '.scala', '.kt',
-        '.cs', '.swift', '.m', '.sh', '.pl', '.r'
-    }
-    # スキャン対象から除外するディレクトリ
-    EXCLUDED_DIRS = {
-        '.git', '__pycache__', 'node_modules', 'venv', '.env',
-        'build', 'dist', 'target', 'bin', 'obj'
-    }
     def __init__(self, base_dir: Path):
         self.base_dir = base_dir
-    def _should_scan_file(self, path: Path) -> bool:
-        if any(excluded in path.parts for excluded in self.EXCLUDED_DIRS):
-            return False
-        return path.suffix.lower() in self.TARGET_EXTENSIONS
-    def _read_file_content(self, file_path: Path) -> Optional[str]:
-        try:
-            # まずUTF-8で試す
-            try:
-                with file_path.open('r', encoding='utf-8') as f:
-                    return f.read()
-            except UnicodeDecodeError:
-                # UTF-8で失敗したらcp932を試す
-                with file_path.open('r', encoding='cp932') as f:
-                    return f.read()
-        except (OSError, UnicodeDecodeError):
-            return None
     def scan_files(self) -> List[FileInfo]:
         if not self.base_dir.exists():
-            raise FileNotFoundError(f"Directory not found: {self.base_dir}")
         files = []
-        for entry in self.base_dir.rglob('*'):
-            if entry.is_file() and self._should_scan_file(entry):
-                content = self._read_file_content(entry)
-                if content is not None:
                     files.append(FileInfo(
-                        path=entry.relative_to(self.base_dir),
-                        content=content
                     ))
         return sorted(files, key=lambda x: str(x.path))

 from pathlib import Path
 from typing import List, Dict, Optional
 from dataclasses import dataclass
+import chardet
 @dataclass
 class FileInfo:
     path: Path
+    size: int
+    extension: str
     content: Optional[str] = None
+    encoding: Optional[str] = None
+    @property
+    def formatted_size(self) -> str:
+        if self.size < 1024:
+            return f"{self.size} B"
+        elif self.size < 1024 * 1024:
+            return f"{self.size/1024:.1f} KB"
+        else:
+            return f"{self.size/(1024*1024):.1f} MB"
 class FileScanner:
+    TARGET_EXTENSIONS = {'.py', '.sh', '.rb', '.js', '.ts', '.java', '.cpp',
+                        '.hpp', '.c', '.h', '.go', '.rs', '.php', '.json',
+                        '.yml', '.yaml', '.toml', '.ini', '.md', '.txt'}
+    EXCLUDED_DIRS = {'.git', '__pycache__', 'node_modules', 'venv', '.env'}
+    MAX_FILE_SIZE = 1 * 1024 * 1024
     def __init__(self, base_dir: Path):
         self.base_dir = base_dir
     def scan_files(self) -> List[FileInfo]:
         if not self.base_dir.exists():
+            raise FileNotFoundError(f"ディレクトリが見つかりません: {self.base_dir}")
         files = []
+        for entry in self.base_dir.glob("**/*"):
+            if (entry.is_file() and
+                entry.suffix.lower() in self.TARGET_EXTENSIONS and
+                not any(excluded in entry.parts for excluded in self.EXCLUDED_DIRS) and
+                entry.stat().st_size <= self.MAX_FILE_SIZE):
+                try:
+                    with entry.open('rb') as f:
+                        raw_data = f.read(4096)
+                        encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
+                    with entry.open('r', encoding=encoding) as f:
+                        content = f.read()
                     files.append(FileInfo(
+                        path=entry.absolute(),
+                        size=entry.stat().st_size,
+                        extension=entry.suffix.lower(),
+                        content=content,
+                        encoding=encoding
                     ))
+                except:
+                    continue
         return sorted(files, key=lambda x: str(x.path))

services/llm_service.py CHANGED Viewed

@@ -1,5 +1,4 @@
-from typing import Optional, List, Dict, Any
-import openai
 import anthropic
 from dataclasses import dataclass
 from config.llm_settings import LLMSettings
@@ -14,26 +13,15 @@ class LLMService:
     MAX_TURNS = 5
     def __init__(self):
-        """LLMサービスの初期化"""
         self.settings = LLMSettings()
-        self.current_model = self.settings.default_llm
-        # API クライアントの初期化
-        if self.settings.anthropic_api_key:
-            self.claude_client = anthropic.Anthropic(api_key=self.settings.anthropic_api_key)
-        if self.settings.openai_api_key:
-            openai.api_key = self.settings.openai_api_key
         self.conversation_history: List[Message] = []
     def switch_model(self, model: str):
-        """使用するモデルを切り替え"""
-        if model not in self.settings.get_available_models():
-            raise ValueError(f"モデル {model} は利用できません")
-        self.current_model = model
     def create_prompt(self, content: str, query: str) -> str:
-        """プロンプトを生成"""
         return f"""以下はGitHubリポジトリのコード解析結果です。このコードについて質問に答えてください。
 コード解析結果:
@@ -44,54 +32,31 @@ class LLMService:
 できるだけ具体的に、コードの内容を参照しながら回答してください。"""
     def _add_to_history(self, role: str, content: str):
-        """会話履歴に追加（最大5ターン）"""
         self.conversation_history.append(Message(role=role, content=content))
-        # 最大ターン数を超えた場合、古い会話を削除
-        if len(self.conversation_history) > self.MAX_TURNS * 2:  # 各ターンは質問と回答で2メッセージ
             self.conversation_history = self.conversation_history[-self.MAX_TURNS * 2:]
     def _format_messages_for_claude(self) -> List[Dict[str, str]]:
-        """Claude用にメッセージをフォーマット"""
         return [{"role": msg.role, "content": msg.content}
                 for msg in self.conversation_history]
-    def _format_messages_for_gpt(self) -> List[Dict[str, str]]:
-        """GPT用にメッセージをフォーマット"""
-        return [
-            {"role": "system", "content": "あなたはコードアナリストとして、リポジトリの解析と質問への回答を行います。"},
-            *[{"role": msg.role, "content": msg.content}
-              for msg in self.conversation_history]
-        ]
     def get_conversation_history(self) -> List[Dict[str, str]]:
-        """会話履歴を取得"""
         return [{"role": msg.role, "content": msg.content}
                 for msg in self.conversation_history]
     def clear_history(self):
-        """会話履歴をクリア"""
         self.conversation_history = []
     def get_response(self, content: str, query: str) -> tuple[Optional[str], Optional[str]]:
-        """LLMを使用して回答を生成"""
         try:
             prompt = self.create_prompt(content, query)
             self._add_to_history("user", prompt)
-            if self.current_model == 'claude':
-                response = self.claude_client.messages.create(
-                    model="claude-3-sonnet-20240229",
-                    max_tokens=4000,
-                    messages=self._format_messages_for_claude()
-                )
-                answer = response.content[0].text
-            else:  # gpt
-                response = openai.ChatCompletion.create(
-                    model="gpt-4o",
-                    messages=self._format_messages_for_gpt()
-                )
-                answer = response.choices[0].message.content
             self._add_to_history("assistant", answer)
             return answer, None
@@ -101,7 +66,6 @@ class LLMService:
     @staticmethod
     def format_code_content(files: List[FileInfo]) -> str:
-        """ファイル内容をプロンプト用にフォーマット"""
         formatted_content = []
         for file_info in files:
             formatted_content.append(

+from typing import Optional, List, Dict
 import anthropic
 from dataclasses import dataclass
 from config.llm_settings import LLMSettings
     MAX_TURNS = 5
     def __init__(self):
         self.settings = LLMSettings()
+        self.claude_client = anthropic.Anthropic(api_key=self.settings.anthropic_api_key)
         self.conversation_history: List[Message] = []
     def switch_model(self, model: str):
+        if model.lower() != "claude":
+            raise ValueError("Only Claude model is available")
     def create_prompt(self, content: str, query: str) -> str:
         return f"""以下はGitHubリポジトリのコード解析結果です。このコードについて質問に答えてください。
 コード解析結果:
 できるだけ具体的に、コードの内容を参照しながら回答してください。"""
     def _add_to_history(self, role: str, content: str):
         self.conversation_history.append(Message(role=role, content=content))
+        if len(self.conversation_history) > self.MAX_TURNS * 2:
             self.conversation_history = self.conversation_history[-self.MAX_TURNS * 2:]
     def _format_messages_for_claude(self) -> List[Dict[str, str]]:
         return [{"role": msg.role, "content": msg.content}
                 for msg in self.conversation_history]
     def get_conversation_history(self) -> List[Dict[str, str]]:
         return [{"role": msg.role, "content": msg.content}
                 for msg in self.conversation_history]
     def clear_history(self):
         self.conversation_history = []
     def get_response(self, content: str, query: str) -> tuple[Optional[str], Optional[str]]:
         try:
             prompt = self.create_prompt(content, query)
             self._add_to_history("user", prompt)
+            response = self.claude_client.messages.create(
+                model="claude-3-5-sonnet-latest",
+                messages=self._format_messages_for_claude()
+            )
+            answer = response.content[0].text
             self._add_to_history("assistant", answer)
             return answer, None
     @staticmethod
     def format_code_content(files: List[FileInfo]) -> str:
         formatted_content = []
         for file_info in files:
             formatted_content.append(